diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,227948 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 23984, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "loss": 21.045429229736328, + "step": 0 + }, + { + "ce_loss": 3.982180595397949, + "epoch": 0, + "step": 0 + }, + { + "distill_loss": 1.7960407733917236, + "epoch": 0, + "step": 0 + }, + { + "epoch": 0, + "ref_ce_loss": 2.866819381713867, + "step": 0 + }, + { + "epoch": 0, + "loss": 20.749279022216797, + "step": 0 + }, + { + "ce_loss": 4.234129905700684, + "epoch": 0, + "step": 0 + }, + { + "distill_loss": 1.7374516725540161, + "epoch": 0, + "step": 0 + }, + { + "epoch": 0, + "ref_ce_loss": 3.2119572162628174, + "step": 0 + }, + { + "epoch": 0, + "loss": 19.981489181518555, + "step": 0 + }, + { + "ce_loss": 3.8592026233673096, + "epoch": 0, + "step": 0 + }, + { + "distill_loss": 1.896045446395874, + "epoch": 0, + "step": 0 + }, + { + "epoch": 0, + "ref_ce_loss": 3.0282931327819824, + "step": 0 + }, + { + "epoch": 0, + "loss": 20.778669357299805, + "step": 0 + }, + { + "ce_loss": 3.793614149093628, + "epoch": 0, + "step": 0 + }, + { + "distill_loss": 1.8912239074707031, + "epoch": 0, + "step": 0 + }, + { + "epoch": 0, + "ref_ce_loss": 2.9311015605926514, + "step": 0 + }, + { + "epoch": 0.00333555703802535, + "loss": 18.5052, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "grad_norm": 762.3750610351562, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "learning_rate": 4.166666666666666e-06, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "loss": 14.035091400146484, + "step": 10 + }, + { + "ce_loss": 3.1100096702575684, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "distill_loss": 1.7577776908874512, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "ref_ce_loss": 3.2110393047332764, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "loss": 13.886899948120117, + "step": 10 + }, + { + "ce_loss": 3.069828510284424, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "distill_loss": 1.7487568855285645, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "ref_ce_loss": 3.2038192749023438, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "loss": 12.481767654418945, + "step": 10 + }, + { + "ce_loss": 3.101062059402466, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "distill_loss": 1.7259626388549805, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "ref_ce_loss": 3.0520899295806885, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "loss": 13.172759056091309, + "step": 10 + }, + { + "ce_loss": 3.231196880340576, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "distill_loss": 1.8049229383468628, + "epoch": 0.00333555703802535, + "step": 10 + }, + { + "epoch": 0.00333555703802535, + "ref_ce_loss": 2.805325984954834, + "step": 10 + }, + { + "epoch": 0.0066711140760507, + "loss": 9.7871, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "grad_norm": 227.34100341796875, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "learning_rate": 8.333333333333332e-06, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "loss": 6.437663555145264, + "step": 20 + }, + { + "ce_loss": 1.5575122833251953, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "distill_loss": 1.659112572669983, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "ref_ce_loss": 3.047043800354004, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "loss": 6.436098575592041, + "step": 20 + }, + { + "ce_loss": 1.5137897729873657, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "distill_loss": 1.670945644378662, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "ref_ce_loss": 3.0977792739868164, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "loss": 6.919541835784912, + "step": 20 + }, + { + "ce_loss": 1.7808164358139038, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "distill_loss": 1.7737149000167847, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "ref_ce_loss": 2.914146900177002, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "loss": 7.18682861328125, + "step": 20 + }, + { + "ce_loss": 1.5944644212722778, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "distill_loss": 1.83235764503479, + "epoch": 0.0066711140760507, + "step": 20 + }, + { + "epoch": 0.0066711140760507, + "ref_ce_loss": 2.944377899169922, + "step": 20 + }, + { + "epoch": 0.01000667111407605, + "loss": 6.1788, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "grad_norm": 74.83038330078125, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "learning_rate": 1.2499999999999999e-05, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "loss": 5.411984920501709, + "step": 30 + }, + { + "ce_loss": 0.5704580545425415, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "distill_loss": 1.3909449577331543, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "ref_ce_loss": 2.3826277256011963, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "loss": 4.33409309387207, + "step": 30 + }, + { + "ce_loss": 0.5164247751235962, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "distill_loss": 1.3752120733261108, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "ref_ce_loss": 2.437467336654663, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "loss": 4.793662071228027, + "step": 30 + }, + { + "ce_loss": 0.4295244514942169, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "distill_loss": 1.3164716958999634, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "ref_ce_loss": 2.5118391513824463, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "loss": 6.409175872802734, + "step": 30 + }, + { + "ce_loss": 0.7220518589019775, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "distill_loss": 1.6983754634857178, + "epoch": 0.01000667111407605, + "step": 30 + }, + { + "epoch": 0.01000667111407605, + "ref_ce_loss": 2.3008272647857666, + "step": 30 + }, + { + "epoch": 0.0133422281521014, + "loss": 4.2478, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "grad_norm": 12.155257225036621, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "learning_rate": 1.6666666666666664e-05, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "loss": 3.65868878364563, + "step": 40 + }, + { + "ce_loss": 0.7195524573326111, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "distill_loss": 1.1870580911636353, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "ref_ce_loss": 1.3222304582595825, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "loss": 3.2196686267852783, + "step": 40 + }, + { + "ce_loss": 0.6631148457527161, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "distill_loss": 1.2712286710739136, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "ref_ce_loss": 1.283046841621399, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "loss": 3.057570457458496, + "step": 40 + }, + { + "ce_loss": 0.42720213532447815, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "distill_loss": 1.1877330541610718, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "ref_ce_loss": 1.2598295211791992, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "loss": 3.893627643585205, + "step": 40 + }, + { + "ce_loss": 0.5937362909317017, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "distill_loss": 1.1929004192352295, + "epoch": 0.0133422281521014, + "step": 40 + }, + { + "epoch": 0.0133422281521014, + "ref_ce_loss": 1.3428030014038086, + "step": 40 + }, + { + "epoch": 0.01667778519012675, + "loss": 3.0172, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "grad_norm": 11.977120399475098, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "learning_rate": 2.0833333333333333e-05, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "loss": 2.141342878341675, + "step": 50 + }, + { + "ce_loss": 0.47699522972106934, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "distill_loss": 0.9509934186935425, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "ref_ce_loss": 0.5504632592201233, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "loss": 2.638903856277466, + "step": 50 + }, + { + "ce_loss": 0.5003000497817993, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "distill_loss": 0.9422565698623657, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "ref_ce_loss": 0.5490682721138, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "loss": 2.7825381755828857, + "step": 50 + }, + { + "ce_loss": 0.49777916073799133, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "distill_loss": 1.050254225730896, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "ref_ce_loss": 0.4461013078689575, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "loss": 2.273829460144043, + "step": 50 + }, + { + "ce_loss": 0.5292913317680359, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "distill_loss": 1.0160512924194336, + "epoch": 0.01667778519012675, + "step": 50 + }, + { + "epoch": 0.01667778519012675, + "ref_ce_loss": 0.49012917280197144, + "step": 50 + }, + { + "epoch": 0.0200133422281521, + "loss": 2.2235, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "grad_norm": 8.11925983428955, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "learning_rate": 2.4999999999999998e-05, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "loss": 1.9671134948730469, + "step": 60 + }, + { + "ce_loss": 0.5590649247169495, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "distill_loss": 0.8475483059883118, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "ref_ce_loss": 0.3405017554759979, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "loss": 2.8774218559265137, + "step": 60 + }, + { + "ce_loss": 0.5681470036506653, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "distill_loss": 0.865785539150238, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "ref_ce_loss": 0.4073866009712219, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "loss": 1.493269681930542, + "step": 60 + }, + { + "ce_loss": 0.3890523612499237, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "distill_loss": 0.7810712456703186, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "ref_ce_loss": 0.322986900806427, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "loss": 2.767509937286377, + "step": 60 + }, + { + "ce_loss": 0.4819281995296478, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "distill_loss": 0.8184367418289185, + "epoch": 0.0200133422281521, + "step": 60 + }, + { + "epoch": 0.0200133422281521, + "ref_ce_loss": 0.2890125811100006, + "step": 60 + }, + { + "epoch": 0.02334889926617745, + "loss": 2.0951, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "grad_norm": 8.139989852905273, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "learning_rate": 2.9166666666666663e-05, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "loss": 1.4588215351104736, + "step": 70 + }, + { + "ce_loss": 0.4765501320362091, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "distill_loss": 0.7598084807395935, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "ref_ce_loss": 0.22233976423740387, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "loss": 2.102530002593994, + "step": 70 + }, + { + "ce_loss": 0.4892614185810089, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "distill_loss": 0.7742966413497925, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "ref_ce_loss": 0.2892897427082062, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "loss": 1.932617425918579, + "step": 70 + }, + { + "ce_loss": 0.5458316802978516, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "distill_loss": 0.8504544496536255, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "ref_ce_loss": 0.3213707506656647, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "loss": 1.812137246131897, + "step": 70 + }, + { + "ce_loss": 0.5298717021942139, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "distill_loss": 0.7876577377319336, + "epoch": 0.02334889926617745, + "step": 70 + }, + { + "epoch": 0.02334889926617745, + "ref_ce_loss": 0.29828882217407227, + "step": 70 + }, + { + "epoch": 0.0266844563042028, + "loss": 1.9725, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "grad_norm": 5.928866863250732, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "learning_rate": 3.333333333333333e-05, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "loss": 1.7023259401321411, + "step": 80 + }, + { + "ce_loss": 0.5784029364585876, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "distill_loss": 0.6333774328231812, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "ref_ce_loss": 0.3491305112838745, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "loss": 1.8371233940124512, + "step": 80 + }, + { + "ce_loss": 0.5097202658653259, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "distill_loss": 0.6676629781723022, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "ref_ce_loss": 0.24543851613998413, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "loss": 2.143604278564453, + "step": 80 + }, + { + "ce_loss": 0.6652351021766663, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "distill_loss": 0.7542927265167236, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "ref_ce_loss": 0.31177300214767456, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "loss": 2.2320713996887207, + "step": 80 + }, + { + "ce_loss": 0.47276201844215393, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "distill_loss": 0.6124601364135742, + "epoch": 0.0266844563042028, + "step": 80 + }, + { + "epoch": 0.0266844563042028, + "ref_ce_loss": 0.23781082034111023, + "step": 80 + }, + { + "epoch": 0.030020013342228154, + "loss": 1.9708, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "grad_norm": 9.603582382202148, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "learning_rate": 3.75e-05, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "loss": 2.0661861896514893, + "step": 90 + }, + { + "ce_loss": 0.5394359827041626, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "distill_loss": 0.6982392072677612, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "ref_ce_loss": 0.24196895956993103, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "loss": 2.078852653503418, + "step": 90 + }, + { + "ce_loss": 0.5509630441665649, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "distill_loss": 0.600722074508667, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "ref_ce_loss": 0.2933999001979828, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "loss": 1.5637493133544922, + "step": 90 + }, + { + "ce_loss": 0.46072623133659363, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "distill_loss": 0.588234543800354, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "ref_ce_loss": 0.2861691117286682, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "loss": 1.6592103242874146, + "step": 90 + }, + { + "ce_loss": 0.5185455679893494, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "distill_loss": 0.673694908618927, + "epoch": 0.030020013342228154, + "step": 90 + }, + { + "epoch": 0.030020013342228154, + "ref_ce_loss": 0.24703171849250793, + "step": 90 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.8399, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "grad_norm": 7.311580657958984, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "learning_rate": 4.1666666666666665e-05, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.2659127712249756, + "step": 100 + }, + { + "ce_loss": 0.48525702953338623, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "distill_loss": 0.5368037819862366, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "ref_ce_loss": 0.24336490035057068, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.4343409538269043, + "step": 100 + }, + { + "ce_loss": 0.44810131192207336, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "distill_loss": 0.47137510776519775, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "ref_ce_loss": 0.3219914138317108, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.8354941606521606, + "step": 100 + }, + { + "ce_loss": 0.5599071383476257, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "distill_loss": 0.6153331995010376, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "ref_ce_loss": 0.22322463989257812, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "loss": 1.9053720235824585, + "step": 100 + }, + { + "ce_loss": 0.5626506805419922, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "distill_loss": 0.6455965042114258, + "epoch": 0.0333555703802535, + "step": 100 + }, + { + "epoch": 0.0333555703802535, + "ref_ce_loss": 0.2309160977602005, + "step": 100 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.617, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "grad_norm": 3.8720383644104004, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "learning_rate": 4.5833333333333334e-05, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.8273422718048096, + "step": 110 + }, + { + "ce_loss": 0.4972834289073944, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "distill_loss": 0.5478036999702454, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "ref_ce_loss": 0.24270348250865936, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.5569121837615967, + "step": 110 + }, + { + "ce_loss": 0.47104451060295105, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "distill_loss": 0.5678485631942749, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "ref_ce_loss": 0.2063506692647934, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.791825771331787, + "step": 110 + }, + { + "ce_loss": 0.5053634643554688, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "distill_loss": 0.5453431606292725, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "ref_ce_loss": 0.2539502680301666, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "loss": 1.4480103254318237, + "step": 110 + }, + { + "ce_loss": 0.5998255014419556, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "distill_loss": 0.5709897875785828, + "epoch": 0.03669112741827885, + "step": 110 + }, + { + "epoch": 0.03669112741827885, + "ref_ce_loss": 0.2771782875061035, + "step": 110 + }, + { + "epoch": 0.0400266844563042, + "loss": 1.7635, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "grad_norm": 11.676295280456543, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "learning_rate": 4.9999999999999996e-05, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "loss": 1.4083495140075684, + "step": 120 + }, + { + "ce_loss": 0.5004300475120544, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "distill_loss": 0.4932730197906494, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "ref_ce_loss": 0.23548881709575653, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "loss": 1.7517530918121338, + "step": 120 + }, + { + "ce_loss": 0.5206258893013, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "distill_loss": 0.5133766531944275, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "ref_ce_loss": 0.30411529541015625, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "loss": 2.425381898880005, + "step": 120 + }, + { + "ce_loss": 0.5057188272476196, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "distill_loss": 0.45857954025268555, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "ref_ce_loss": 0.24030372500419617, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "loss": 1.2033119201660156, + "step": 120 + }, + { + "ce_loss": 0.4649990200996399, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "distill_loss": 0.43947190046310425, + "epoch": 0.0400266844563042, + "step": 120 + }, + { + "epoch": 0.0400266844563042, + "ref_ce_loss": 0.29883599281311035, + "step": 120 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.6532, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "grad_norm": 6.527844429016113, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "learning_rate": 5.4166666666666664e-05, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.7000669240951538, + "step": 130 + }, + { + "ce_loss": 0.46197932958602905, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "distill_loss": 0.46860989928245544, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "ref_ce_loss": 0.18592247366905212, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.247503638267517, + "step": 130 + }, + { + "ce_loss": 0.5011730194091797, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "distill_loss": 0.5982078313827515, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "ref_ce_loss": 0.1478995680809021, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.0552470684051514, + "step": 130 + }, + { + "ce_loss": 0.44217559695243835, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "distill_loss": 0.41241592168807983, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "ref_ce_loss": 0.20050190389156342, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "loss": 1.273942470550537, + "step": 130 + }, + { + "ce_loss": 0.44072940945625305, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "distill_loss": 0.3735302686691284, + "epoch": 0.04336224149432955, + "step": 130 + }, + { + "epoch": 0.04336224149432955, + "ref_ce_loss": 0.28775811195373535, + "step": 130 + }, + { + "epoch": 0.0466977985323549, + "loss": 1.6767, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "grad_norm": 14.396843910217285, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "learning_rate": 5.8333333333333326e-05, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "loss": 3.0713770389556885, + "step": 140 + }, + { + "ce_loss": 0.4275344908237457, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "distill_loss": 0.5006403923034668, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "ref_ce_loss": 0.26904621720314026, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "loss": 2.02996826171875, + "step": 140 + }, + { + "ce_loss": 0.5514727234840393, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "distill_loss": 0.5769180655479431, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "ref_ce_loss": 0.1895466148853302, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "loss": 2.803758382797241, + "step": 140 + }, + { + "ce_loss": 0.6055456399917603, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "distill_loss": 0.624858558177948, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "ref_ce_loss": 0.26189401745796204, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "loss": 1.83541738986969, + "step": 140 + }, + { + "ce_loss": 0.5861349105834961, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "distill_loss": 0.6315572261810303, + "epoch": 0.0466977985323549, + "step": 140 + }, + { + "epoch": 0.0466977985323549, + "ref_ce_loss": 0.24624021351337433, + "step": 140 + }, + { + "epoch": 0.05003335557038025, + "loss": 1.8934, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "grad_norm": 3.9040794372558594, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "learning_rate": 6.25e-05, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "loss": 2.2169854640960693, + "step": 150 + }, + { + "ce_loss": 0.4155034124851227, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "distill_loss": 0.5503289103507996, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "ref_ce_loss": 0.1302298605442047, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "loss": 2.067556858062744, + "step": 150 + }, + { + "ce_loss": 0.5323479771614075, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "distill_loss": 0.5300868153572083, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "ref_ce_loss": 0.25406020879745483, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "loss": 1.628650426864624, + "step": 150 + }, + { + "ce_loss": 0.5438556671142578, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "distill_loss": 0.5136424899101257, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "ref_ce_loss": 0.25217124819755554, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "loss": 1.442086100578308, + "step": 150 + }, + { + "ce_loss": 0.47215816378593445, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "distill_loss": 0.47653883695602417, + "epoch": 0.05003335557038025, + "step": 150 + }, + { + "epoch": 0.05003335557038025, + "ref_ce_loss": 0.24462643265724182, + "step": 150 + }, + { + "epoch": 0.0533689126084056, + "loss": 1.8171, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "grad_norm": 6.843990325927734, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "learning_rate": 6.666666666666666e-05, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "loss": 1.6495225429534912, + "step": 160 + }, + { + "ce_loss": 0.5015798807144165, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "distill_loss": 0.5507673025131226, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "ref_ce_loss": 0.21071067452430725, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "loss": 1.8539624214172363, + "step": 160 + }, + { + "ce_loss": 0.5268104076385498, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "distill_loss": 0.582175612449646, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "ref_ce_loss": 0.1892932802438736, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "loss": 1.9361590147018433, + "step": 160 + }, + { + "ce_loss": 0.5014770030975342, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "distill_loss": 0.5757460594177246, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "ref_ce_loss": 0.27097243070602417, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "loss": 2.1110458374023438, + "step": 160 + }, + { + "ce_loss": 0.39067360758781433, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "distill_loss": 0.4890226125717163, + "epoch": 0.0533689126084056, + "step": 160 + }, + { + "epoch": 0.0533689126084056, + "ref_ce_loss": 0.16871194541454315, + "step": 160 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.7352, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "grad_norm": 11.573543548583984, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "learning_rate": 7.083333333333332e-05, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.8267295360565186, + "step": 170 + }, + { + "ce_loss": 0.5845050811767578, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "distill_loss": 0.5946557521820068, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "ref_ce_loss": 0.3130984604358673, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.8540599346160889, + "step": 170 + }, + { + "ce_loss": 0.44277799129486084, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "distill_loss": 0.5504587888717651, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "ref_ce_loss": 0.18251009285449982, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.7149605751037598, + "step": 170 + }, + { + "ce_loss": 0.4688924551010132, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "distill_loss": 0.446077436208725, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "ref_ce_loss": 0.21932320296764374, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "loss": 1.5233681201934814, + "step": 170 + }, + { + "ce_loss": 0.4129520654678345, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "distill_loss": 0.5524225234985352, + "epoch": 0.05670446964643095, + "step": 170 + }, + { + "epoch": 0.05670446964643095, + "ref_ce_loss": 0.17084111273288727, + "step": 170 + }, + { + "epoch": 0.06004002668445631, + "loss": 1.5744, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "grad_norm": 7.211359977722168, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "learning_rate": 7.5e-05, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "loss": 1.1495260000228882, + "step": 180 + }, + { + "ce_loss": 0.4412047863006592, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "distill_loss": 0.5178871750831604, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "ref_ce_loss": 0.19014258682727814, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "loss": 1.5029988288879395, + "step": 180 + }, + { + "ce_loss": 0.5065923929214478, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "distill_loss": 0.5425624251365662, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "ref_ce_loss": 0.2572532892227173, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "loss": 1.4675511121749878, + "step": 180 + }, + { + "ce_loss": 0.44596901535987854, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "distill_loss": 0.4775707423686981, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "ref_ce_loss": 0.22245991230010986, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "loss": 2.0268988609313965, + "step": 180 + }, + { + "ce_loss": 0.5191164016723633, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "distill_loss": 0.6096824407577515, + "epoch": 0.06004002668445631, + "step": 180 + }, + { + "epoch": 0.06004002668445631, + "ref_ce_loss": 0.24708403646945953, + "step": 180 + }, + { + "epoch": 0.06337558372248166, + "loss": 1.6854, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "grad_norm": 5.338657379150391, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "learning_rate": 7.916666666666666e-05, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "loss": 2.0079145431518555, + "step": 190 + }, + { + "ce_loss": 0.45281872153282166, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "distill_loss": 0.585824191570282, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "ref_ce_loss": 0.18356651067733765, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "loss": 2.0530364513397217, + "step": 190 + }, + { + "ce_loss": 0.5388656854629517, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "distill_loss": 0.5773449540138245, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "ref_ce_loss": 0.23279789090156555, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "loss": 1.787460207939148, + "step": 190 + }, + { + "ce_loss": 0.5597324371337891, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "distill_loss": 0.6426137685775757, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "ref_ce_loss": 0.1847495138645172, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "loss": 2.38955020904541, + "step": 190 + }, + { + "ce_loss": 0.542634129524231, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "distill_loss": 0.6852626204490662, + "epoch": 0.06337558372248166, + "step": 190 + }, + { + "epoch": 0.06337558372248166, + "ref_ce_loss": 0.22592175006866455, + "step": 190 + }, + { + "epoch": 0.066711140760507, + "loss": 1.7286, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "grad_norm": 4.373966693878174, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "learning_rate": 8.333333333333333e-05, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "loss": 1.9177393913269043, + "step": 200 + }, + { + "ce_loss": 0.5175270438194275, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "distill_loss": 0.5832851529121399, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "ref_ce_loss": 0.203318253159523, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "loss": 1.9105541706085205, + "step": 200 + }, + { + "ce_loss": 0.4564187526702881, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "distill_loss": 0.6109424829483032, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "ref_ce_loss": 0.22662192583084106, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "loss": 2.0437614917755127, + "step": 200 + }, + { + "ce_loss": 0.5581548810005188, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "distill_loss": 0.660732090473175, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "ref_ce_loss": 0.2035897672176361, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "loss": 1.8254846334457397, + "step": 200 + }, + { + "ce_loss": 0.49772366881370544, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "distill_loss": 0.6139116287231445, + "epoch": 0.066711140760507, + "step": 200 + }, + { + "epoch": 0.066711140760507, + "ref_ce_loss": 0.14511890709400177, + "step": 200 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.7987, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "grad_norm": 8.68053150177002, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "learning_rate": 8.75e-05, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.711827039718628, + "step": 210 + }, + { + "ce_loss": 0.4407254457473755, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "distill_loss": 0.6045986413955688, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "ref_ce_loss": 0.1859685778617859, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.258393406867981, + "step": 210 + }, + { + "ce_loss": 0.396217405796051, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "distill_loss": 0.53984534740448, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "ref_ce_loss": 0.15166202187538147, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.666795015335083, + "step": 210 + }, + { + "ce_loss": 0.4788994789123535, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "distill_loss": 0.6096857190132141, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "ref_ce_loss": 0.20014192163944244, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "loss": 1.3152960538864136, + "step": 210 + }, + { + "ce_loss": 0.4885876178741455, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "distill_loss": 0.5094537734985352, + "epoch": 0.07004669779853236, + "step": 210 + }, + { + "epoch": 0.07004669779853236, + "ref_ce_loss": 0.3172496259212494, + "step": 210 + }, + { + "epoch": 0.0733822548365577, + "loss": 1.6134, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "grad_norm": 7.969250679016113, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "learning_rate": 9.166666666666667e-05, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "loss": 1.9584766626358032, + "step": 220 + }, + { + "ce_loss": 0.5503972768783569, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "distill_loss": 0.846336841583252, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "ref_ce_loss": 0.272134006023407, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "loss": 3.0287160873413086, + "step": 220 + }, + { + "ce_loss": 0.48527926206588745, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "distill_loss": 0.7273819446563721, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "ref_ce_loss": 0.24912559986114502, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "loss": 2.417612314224243, + "step": 220 + }, + { + "ce_loss": 0.49760445952415466, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "distill_loss": 0.7104735970497131, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "ref_ce_loss": 0.20668411254882812, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "loss": 1.9464311599731445, + "step": 220 + }, + { + "ce_loss": 0.38983479142189026, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "distill_loss": 0.5419527888298035, + "epoch": 0.0733822548365577, + "step": 220 + }, + { + "epoch": 0.0733822548365577, + "ref_ce_loss": 0.20586061477661133, + "step": 220 + }, + { + "epoch": 0.07671781187458306, + "loss": 1.8046, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "grad_norm": 4.189472198486328, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "learning_rate": 9.583333333333332e-05, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "loss": 1.4793825149536133, + "step": 230 + }, + { + "ce_loss": 0.5580512285232544, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "distill_loss": 0.6622768044471741, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "ref_ce_loss": 0.2589597702026367, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "loss": 1.6697819232940674, + "step": 230 + }, + { + "ce_loss": 0.4094105064868927, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "distill_loss": 0.6387210488319397, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "ref_ce_loss": 0.17742785811424255, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "loss": 2.397416830062866, + "step": 230 + }, + { + "ce_loss": 0.5211621522903442, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "distill_loss": 0.6582202911376953, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "ref_ce_loss": 0.22356149554252625, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "loss": 1.714322805404663, + "step": 230 + }, + { + "ce_loss": 0.5506291389465332, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "distill_loss": 0.7077121138572693, + "epoch": 0.07671781187458306, + "step": 230 + }, + { + "epoch": 0.07671781187458306, + "ref_ce_loss": 0.22887969017028809, + "step": 230 + }, + { + "epoch": 0.0800533689126084, + "loss": 1.6997, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "grad_norm": 4.067202568054199, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "learning_rate": 9.999999999999999e-05, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "loss": 1.3863872289657593, + "step": 240 + }, + { + "ce_loss": 0.40065205097198486, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "distill_loss": 0.5652359127998352, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "ref_ce_loss": 0.17888511717319489, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "loss": 1.5082571506500244, + "step": 240 + }, + { + "ce_loss": 0.4735325872898102, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "distill_loss": 0.6025104522705078, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "ref_ce_loss": 0.20674863457679749, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "loss": 1.3987960815429688, + "step": 240 + }, + { + "ce_loss": 0.3731807470321655, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "distill_loss": 0.5945234894752502, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "ref_ce_loss": 0.1682978868484497, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "loss": 1.8205552101135254, + "step": 240 + }, + { + "ce_loss": 0.3344307839870453, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "distill_loss": 0.6057762503623962, + "epoch": 0.0800533689126084, + "step": 240 + }, + { + "epoch": 0.0800533689126084, + "ref_ce_loss": 0.1428370624780655, + "step": 240 + }, + { + "epoch": 0.08338892595063375, + "loss": 1.702, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "grad_norm": 3.4190220832824707, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "learning_rate": 0.00010416666666666666, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "loss": 2.404367446899414, + "step": 250 + }, + { + "ce_loss": 0.5212981700897217, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "distill_loss": 0.6047353148460388, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "ref_ce_loss": 0.18801236152648926, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "loss": 1.6255981922149658, + "step": 250 + }, + { + "ce_loss": 0.3867962658405304, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "distill_loss": 0.5201380848884583, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "ref_ce_loss": 0.19661235809326172, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "loss": 1.7828060388565063, + "step": 250 + }, + { + "ce_loss": 0.4401053786277771, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "distill_loss": 0.5746744871139526, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "ref_ce_loss": 0.17614340782165527, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "loss": 1.0926663875579834, + "step": 250 + }, + { + "ce_loss": 0.363411545753479, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "distill_loss": 0.44517719745635986, + "epoch": 0.08338892595063375, + "step": 250 + }, + { + "epoch": 0.08338892595063375, + "ref_ce_loss": 0.27430668473243713, + "step": 250 + }, + { + "epoch": 0.0867244829886591, + "loss": 1.5861, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "grad_norm": 4.730551719665527, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "learning_rate": 0.00010833333333333333, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "loss": 2.1394453048706055, + "step": 260 + }, + { + "ce_loss": 0.48058784008026123, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "distill_loss": 0.5192797780036926, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "ref_ce_loss": 0.1934836208820343, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "loss": 1.4074976444244385, + "step": 260 + }, + { + "ce_loss": 0.4552706182003021, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "distill_loss": 0.48851698637008667, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "ref_ce_loss": 0.15941733121871948, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "loss": 1.5271190404891968, + "step": 260 + }, + { + "ce_loss": 0.4061850905418396, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "distill_loss": 0.49891141057014465, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "ref_ce_loss": 0.23973435163497925, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "loss": 1.4721564054489136, + "step": 260 + }, + { + "ce_loss": 0.4186480939388275, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "distill_loss": 0.46537312865257263, + "epoch": 0.0867244829886591, + "step": 260 + }, + { + "epoch": 0.0867244829886591, + "ref_ce_loss": 0.22249318659305573, + "step": 260 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.598, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "grad_norm": 6.989984035491943, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "learning_rate": 0.0001125, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.7460898160934448, + "step": 270 + }, + { + "ce_loss": 0.5612281560897827, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "distill_loss": 0.7256549596786499, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "ref_ce_loss": 0.15846964716911316, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.6589117050170898, + "step": 270 + }, + { + "ce_loss": 0.3386031985282898, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "distill_loss": 0.5948140621185303, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "ref_ce_loss": 0.1583528220653534, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.8619314432144165, + "step": 270 + }, + { + "ce_loss": 0.4433027505874634, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "distill_loss": 0.6727502346038818, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "ref_ce_loss": 0.19358614087104797, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "loss": 1.4343819618225098, + "step": 270 + }, + { + "ce_loss": 0.31405356526374817, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "distill_loss": 0.5503586530685425, + "epoch": 0.09006004002668445, + "step": 270 + }, + { + "epoch": 0.09006004002668445, + "ref_ce_loss": 0.16053526103496552, + "step": 270 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.5236, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "grad_norm": 2.97304368019104, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "learning_rate": 0.00011666666666666665, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.3160141706466675, + "step": 280 + }, + { + "ce_loss": 0.4134749174118042, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "distill_loss": 0.5966621041297913, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "ref_ce_loss": 0.13658492267131805, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.6723679304122925, + "step": 280 + }, + { + "ce_loss": 0.5664156675338745, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "distill_loss": 0.6793075203895569, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "ref_ce_loss": 0.2132052481174469, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.7364765405654907, + "step": 280 + }, + { + "ce_loss": 0.3761884570121765, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "distill_loss": 0.6366319060325623, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "ref_ce_loss": 0.19218598306179047, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "loss": 1.455072283744812, + "step": 280 + }, + { + "ce_loss": 0.402457594871521, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "distill_loss": 0.6173094511032104, + "epoch": 0.0933955970647098, + "step": 280 + }, + { + "epoch": 0.0933955970647098, + "ref_ce_loss": 0.18485473096370697, + "step": 280 + }, + { + "epoch": 0.09673115410273515, + "loss": 1.5508, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "grad_norm": 6.759135723114014, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "learning_rate": 0.00012083333333333332, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "loss": 2.49653959274292, + "step": 290 + }, + { + "ce_loss": 0.5873475074768066, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "distill_loss": 0.47205284237861633, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "ref_ce_loss": 0.2220914363861084, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "loss": 1.023373007774353, + "step": 290 + }, + { + "ce_loss": 0.42132118344306946, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "distill_loss": 0.48384732007980347, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "ref_ce_loss": 0.11812644451856613, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "loss": 1.0123854875564575, + "step": 290 + }, + { + "ce_loss": 0.32844048738479614, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "distill_loss": 0.3588325083255768, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "ref_ce_loss": 0.18135057389736176, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "loss": 1.6200289726257324, + "step": 290 + }, + { + "ce_loss": 0.4336802065372467, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "distill_loss": 0.517190158367157, + "epoch": 0.09673115410273515, + "step": 290 + }, + { + "epoch": 0.09673115410273515, + "ref_ce_loss": 0.18303078413009644, + "step": 290 + }, + { + "epoch": 0.1000667111407605, + "loss": 1.6154, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "grad_norm": 4.066988468170166, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "learning_rate": 0.000125, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "loss": 1.648413062095642, + "step": 300 + }, + { + "ce_loss": 0.3631472587585449, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "distill_loss": 0.32697027921676636, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "ref_ce_loss": 0.2214168757200241, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "loss": 1.8053052425384521, + "step": 300 + }, + { + "ce_loss": 0.47860148549079895, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "distill_loss": 0.43630748987197876, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "ref_ce_loss": 0.20420612394809723, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "loss": 0.9513359665870667, + "step": 300 + }, + { + "ce_loss": 0.39622437953948975, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "distill_loss": 0.365519642829895, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "ref_ce_loss": 0.18928639590740204, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "loss": 1.047836184501648, + "step": 300 + }, + { + "ce_loss": 0.44009682536125183, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "distill_loss": 0.3900669515132904, + "epoch": 0.1000667111407605, + "step": 300 + }, + { + "epoch": 0.1000667111407605, + "ref_ce_loss": 0.2174147367477417, + "step": 300 + }, + { + "epoch": 0.10340226817878585, + "loss": 1.5139, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "grad_norm": 2.1779675483703613, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "learning_rate": 0.00012916666666666667, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "loss": 1.72920823097229, + "step": 310 + }, + { + "ce_loss": 0.40507739782333374, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "distill_loss": 0.4770023822784424, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "ref_ce_loss": 0.19400697946548462, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "loss": 1.1937280893325806, + "step": 310 + }, + { + "ce_loss": 0.3553615212440491, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "distill_loss": 0.4500002861022949, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "ref_ce_loss": 0.1656891256570816, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "loss": 1.708717703819275, + "step": 310 + }, + { + "ce_loss": 0.36175283789634705, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "distill_loss": 0.5048600435256958, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "ref_ce_loss": 0.13430316746234894, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "loss": 1.4646344184875488, + "step": 310 + }, + { + "ce_loss": 0.535317599773407, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "distill_loss": 0.4663164019584656, + "epoch": 0.10340226817878585, + "step": 310 + }, + { + "epoch": 0.10340226817878585, + "ref_ce_loss": 0.2614893615245819, + "step": 310 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.5879, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "grad_norm": 3.444401979446411, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "learning_rate": 0.0001333333333333333, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.3173494338989258, + "step": 320 + }, + { + "ce_loss": 0.4536479711532593, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "distill_loss": 0.38794639706611633, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "ref_ce_loss": 0.20736144483089447, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.2465583086013794, + "step": 320 + }, + { + "ce_loss": 0.4199206531047821, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "distill_loss": 0.4158381223678589, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "ref_ce_loss": 0.15094277262687683, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.271466612815857, + "step": 320 + }, + { + "ce_loss": 0.3549860119819641, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "distill_loss": 0.2894030511379242, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "ref_ce_loss": 0.24361640214920044, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "loss": 1.6522722244262695, + "step": 320 + }, + { + "ce_loss": 0.41289564967155457, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "distill_loss": 0.4370846152305603, + "epoch": 0.1067378252168112, + "step": 320 + }, + { + "epoch": 0.1067378252168112, + "ref_ce_loss": 0.222516730427742, + "step": 320 + }, + { + "epoch": 0.11007338225483655, + "loss": 1.4188, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "grad_norm": 4.895388126373291, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "learning_rate": 0.00013749999999999998, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "loss": 1.594529628753662, + "step": 330 + }, + { + "ce_loss": 0.378388375043869, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "distill_loss": 0.32350635528564453, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "ref_ce_loss": 0.19054223597049713, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "loss": 1.8214046955108643, + "step": 330 + }, + { + "ce_loss": 0.4791013300418854, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "distill_loss": 0.3267383873462677, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "ref_ce_loss": 0.23376412689685822, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "loss": 0.8836880326271057, + "step": 330 + }, + { + "ce_loss": 0.3631823658943176, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "distill_loss": 0.3676852881908417, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "ref_ce_loss": 0.1527552306652069, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "loss": 1.3370083570480347, + "step": 330 + }, + { + "ce_loss": 0.4256639778614044, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "distill_loss": 0.41985267400741577, + "epoch": 0.11007338225483655, + "step": 330 + }, + { + "epoch": 0.11007338225483655, + "ref_ce_loss": 0.1425405740737915, + "step": 330 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.4184, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "grad_norm": 6.696072578430176, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "learning_rate": 0.00014166666666666665, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.4294278621673584, + "step": 340 + }, + { + "ce_loss": 0.4337483048439026, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "distill_loss": 0.42644885182380676, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "ref_ce_loss": 0.14427414536476135, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.463660717010498, + "step": 340 + }, + { + "ce_loss": 0.5838468074798584, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "distill_loss": 0.46177640557289124, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "ref_ce_loss": 0.2553325593471527, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.427936315536499, + "step": 340 + }, + { + "ce_loss": 0.48335981369018555, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "distill_loss": 0.4261768162250519, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "ref_ce_loss": 0.24112361669540405, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "loss": 1.4521446228027344, + "step": 340 + }, + { + "ce_loss": 0.4573907256126404, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "distill_loss": 0.39685431122779846, + "epoch": 0.1134089392928619, + "step": 340 + }, + { + "epoch": 0.1134089392928619, + "ref_ce_loss": 0.16895025968551636, + "step": 340 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.4988, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "grad_norm": 4.563868045806885, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "learning_rate": 0.00014583333333333332, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.0791542530059814, + "step": 350 + }, + { + "ce_loss": 0.35604622960090637, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "distill_loss": 0.3385554552078247, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "ref_ce_loss": 0.1298314332962036, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.874572515487671, + "step": 350 + }, + { + "ce_loss": 0.5706193447113037, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "distill_loss": 0.462056040763855, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "ref_ce_loss": 0.18519967794418335, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.3227744102478027, + "step": 350 + }, + { + "ce_loss": 0.4325740933418274, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "distill_loss": 0.4918447732925415, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "ref_ce_loss": 0.18212299048900604, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "loss": 1.2638686895370483, + "step": 350 + }, + { + "ce_loss": 0.4000357687473297, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "distill_loss": 0.38934990763664246, + "epoch": 0.11674449633088725, + "step": 350 + }, + { + "epoch": 0.11674449633088725, + "ref_ce_loss": 0.16199086606502533, + "step": 350 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.4421, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "grad_norm": 3.7695729732513428, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "learning_rate": 0.00015, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.2791365385055542, + "step": 360 + }, + { + "ce_loss": 0.4072173535823822, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "distill_loss": 0.5058611631393433, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "ref_ce_loss": 0.17743639647960663, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.5815454721450806, + "step": 360 + }, + { + "ce_loss": 0.48840242624282837, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "distill_loss": 0.546277642250061, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "ref_ce_loss": 0.26060250401496887, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.5207912921905518, + "step": 360 + }, + { + "ce_loss": 0.352418452501297, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "distill_loss": 0.5164685845375061, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "ref_ce_loss": 0.1214674562215805, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "loss": 1.307206153869629, + "step": 360 + }, + { + "ce_loss": 0.3980710804462433, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "distill_loss": 0.5026689171791077, + "epoch": 0.12008005336891261, + "step": 360 + }, + { + "epoch": 0.12008005336891261, + "ref_ce_loss": 0.1789749711751938, + "step": 360 + }, + { + "epoch": 0.12341561040693796, + "loss": 1.2719, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "grad_norm": 2.6082639694213867, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "learning_rate": 0.00015416666666666663, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "loss": 1.9527366161346436, + "step": 370 + }, + { + "ce_loss": 0.4247026741504669, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "distill_loss": 0.44470375776290894, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "ref_ce_loss": 0.23504160344600677, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "loss": 1.3341894149780273, + "step": 370 + }, + { + "ce_loss": 0.47517120838165283, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "distill_loss": 0.4574129581451416, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "ref_ce_loss": 0.24176302552223206, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "loss": 1.0476963520050049, + "step": 370 + }, + { + "ce_loss": 0.32721778750419617, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "distill_loss": 0.44194135069847107, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "ref_ce_loss": 0.10548078268766403, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "loss": 1.5318841934204102, + "step": 370 + }, + { + "ce_loss": 0.38235610723495483, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "distill_loss": 0.4024381935596466, + "epoch": 0.12341561040693796, + "step": 370 + }, + { + "epoch": 0.12341561040693796, + "ref_ce_loss": 0.2406122088432312, + "step": 370 + }, + { + "epoch": 0.12675116744496331, + "loss": 1.3198, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "grad_norm": 3.5721678733825684, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "learning_rate": 0.00015833333333333332, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "loss": 1.3019144535064697, + "step": 380 + }, + { + "ce_loss": 0.47807297110557556, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "distill_loss": 0.3548854887485504, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "ref_ce_loss": 0.2511584162712097, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "loss": 0.7955793142318726, + "step": 380 + }, + { + "ce_loss": 0.31110042333602905, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "distill_loss": 0.2900923788547516, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "ref_ce_loss": 0.19429923593997955, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "loss": 1.0563925504684448, + "step": 380 + }, + { + "ce_loss": 0.4805901348590851, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "distill_loss": 0.3224341571331024, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "ref_ce_loss": 0.25321510434150696, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "loss": 1.2074453830718994, + "step": 380 + }, + { + "ce_loss": 0.39232152700424194, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "distill_loss": 0.3258730173110962, + "epoch": 0.12675116744496331, + "step": 380 + }, + { + "epoch": 0.12675116744496331, + "ref_ce_loss": 0.2012324184179306, + "step": 380 + }, + { + "epoch": 0.13008672448298866, + "loss": 1.256, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "grad_norm": 5.881503105163574, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "learning_rate": 0.00016249999999999997, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "loss": 1.3629982471466064, + "step": 390 + }, + { + "ce_loss": 0.36652955412864685, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "distill_loss": 0.29035764932632446, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "ref_ce_loss": 0.22455023229122162, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "loss": 0.9247293472290039, + "step": 390 + }, + { + "ce_loss": 0.4429352283477783, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "distill_loss": 0.29146555066108704, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "ref_ce_loss": 0.19032831490039825, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "loss": 1.3504780530929565, + "step": 390 + }, + { + "ce_loss": 0.3869108557701111, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "distill_loss": 0.2632961869239807, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "ref_ce_loss": 0.1716235727071762, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "loss": 0.904478907585144, + "step": 390 + }, + { + "ce_loss": 0.3317762017250061, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "distill_loss": 0.2193835973739624, + "epoch": 0.13008672448298866, + "step": 390 + }, + { + "epoch": 0.13008672448298866, + "ref_ce_loss": 0.1852221041917801, + "step": 390 + }, + { + "epoch": 0.133422281521014, + "loss": 1.3014, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "grad_norm": 4.005307674407959, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "learning_rate": 0.00016666666666666666, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "loss": 1.7505016326904297, + "step": 400 + }, + { + "ce_loss": 0.44826963543891907, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "distill_loss": 0.23693206906318665, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "ref_ce_loss": 0.26399722695350647, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "loss": 0.8870535492897034, + "step": 400 + }, + { + "ce_loss": 0.3827820122241974, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "distill_loss": 0.20150505006313324, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "ref_ce_loss": 0.15887323021888733, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "loss": 1.160254955291748, + "step": 400 + }, + { + "ce_loss": 0.37770774960517883, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "distill_loss": 0.24424059689044952, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "ref_ce_loss": 0.2287183552980423, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "loss": 1.241231083869934, + "step": 400 + }, + { + "ce_loss": 0.4069371819496155, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "distill_loss": 0.22539016604423523, + "epoch": 0.133422281521014, + "step": 400 + }, + { + "epoch": 0.133422281521014, + "ref_ce_loss": 0.21300897002220154, + "step": 400 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.3101, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "grad_norm": 5.411527156829834, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "learning_rate": 0.0001708333333333333, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.7810142040252686, + "step": 410 + }, + { + "ce_loss": 0.45244982838630676, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "distill_loss": 0.2896427810192108, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "ref_ce_loss": 0.1715669482946396, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.1003687381744385, + "step": 410 + }, + { + "ce_loss": 0.3501242399215698, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "distill_loss": 0.26800230145454407, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "ref_ce_loss": 0.14147433638572693, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.1758261919021606, + "step": 410 + }, + { + "ce_loss": 0.4695630371570587, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "distill_loss": 0.3363664746284485, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "ref_ce_loss": 0.1658724695444107, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "loss": 1.3013876676559448, + "step": 410 + }, + { + "ce_loss": 0.4676929712295532, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "distill_loss": 0.2978712022304535, + "epoch": 0.13675783855903936, + "step": 410 + }, + { + "epoch": 0.13675783855903936, + "ref_ce_loss": 0.18323028087615967, + "step": 410 + }, + { + "epoch": 0.1400933955970647, + "loss": 1.1958, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "grad_norm": 4.778338432312012, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "learning_rate": 0.000175, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "loss": 0.9920318722724915, + "step": 420 + }, + { + "ce_loss": 0.5251628160476685, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "distill_loss": 0.2395399510860443, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "ref_ce_loss": 0.22730065882205963, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "loss": 1.2365477085113525, + "step": 420 + }, + { + "ce_loss": 0.4431440532207489, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "distill_loss": 0.2159503847360611, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "ref_ce_loss": 0.24663586914539337, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "loss": 0.8285670876502991, + "step": 420 + }, + { + "ce_loss": 0.42659738659858704, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "distill_loss": 0.20237162709236145, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "ref_ce_loss": 0.19959665834903717, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "loss": 1.406829833984375, + "step": 420 + }, + { + "ce_loss": 0.3699018955230713, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "distill_loss": 0.19302500784397125, + "epoch": 0.1400933955970647, + "step": 420 + }, + { + "epoch": 0.1400933955970647, + "ref_ce_loss": 0.21495334804058075, + "step": 420 + }, + { + "epoch": 0.14342895263509006, + "loss": 1.2285, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "grad_norm": 8.007396697998047, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "learning_rate": 0.00017916666666666664, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "loss": 1.0800083875656128, + "step": 430 + }, + { + "ce_loss": 0.34309080243110657, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "distill_loss": 0.17490103840827942, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "ref_ce_loss": 0.1360379010438919, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "loss": 1.1706424951553345, + "step": 430 + }, + { + "ce_loss": 0.4648958742618561, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "distill_loss": 0.21292121708393097, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "ref_ce_loss": 0.23670294880867004, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "loss": 1.0521199703216553, + "step": 430 + }, + { + "ce_loss": 0.4007576107978821, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "distill_loss": 0.18228591978549957, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "ref_ce_loss": 0.2405649870634079, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "loss": 0.9265434741973877, + "step": 430 + }, + { + "ce_loss": 0.3507271409034729, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "distill_loss": 0.1507226973772049, + "epoch": 0.14342895263509006, + "step": 430 + }, + { + "epoch": 0.14342895263509006, + "ref_ce_loss": 0.15023833513259888, + "step": 430 + }, + { + "epoch": 0.1467645096731154, + "loss": 1.1471, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "grad_norm": 4.192609786987305, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "learning_rate": 0.00018333333333333334, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "loss": 0.79221510887146, + "step": 440 + }, + { + "ce_loss": 0.3981459140777588, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "distill_loss": 0.22157087922096252, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "ref_ce_loss": 0.17235179245471954, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "loss": 0.863082230091095, + "step": 440 + }, + { + "ce_loss": 0.36590737104415894, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "distill_loss": 0.1941511183977127, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "ref_ce_loss": 0.17397010326385498, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "loss": 0.7825886011123657, + "step": 440 + }, + { + "ce_loss": 0.3600039780139923, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "distill_loss": 0.1648310422897339, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "ref_ce_loss": 0.2572781443595886, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "loss": 1.2959476709365845, + "step": 440 + }, + { + "ce_loss": 0.42079582810401917, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "distill_loss": 0.18587611615657806, + "epoch": 0.1467645096731154, + "step": 440 + }, + { + "epoch": 0.1467645096731154, + "ref_ce_loss": 0.23832887411117554, + "step": 440 + }, + { + "epoch": 0.15010006671114076, + "loss": 1.1871, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "grad_norm": 3.1583211421966553, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "learning_rate": 0.00018749999999999998, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "loss": 1.155402660369873, + "step": 450 + }, + { + "ce_loss": 0.3351343870162964, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "distill_loss": 0.21110394597053528, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "ref_ce_loss": 0.17169038951396942, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "loss": 0.8734752535820007, + "step": 450 + }, + { + "ce_loss": 0.390616238117218, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "distill_loss": 0.2063642293214798, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "ref_ce_loss": 0.15543001890182495, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "loss": 1.4934319257736206, + "step": 450 + }, + { + "ce_loss": 0.366504967212677, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "distill_loss": 0.2531406581401825, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "ref_ce_loss": 0.15823999047279358, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "loss": 1.0348056554794312, + "step": 450 + }, + { + "ce_loss": 0.4287627041339874, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "distill_loss": 0.23162701725959778, + "epoch": 0.15010006671114076, + "step": 450 + }, + { + "epoch": 0.15010006671114076, + "ref_ce_loss": 0.21052688360214233, + "step": 450 + }, + { + "epoch": 0.1534356237491661, + "loss": 1.1691, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "grad_norm": 2.9276604652404785, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "learning_rate": 0.00019166666666666665, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "loss": 0.9366768598556519, + "step": 460 + }, + { + "ce_loss": 0.3069738745689392, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "distill_loss": 0.1811765879392624, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "ref_ce_loss": 0.12331288307905197, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "loss": 0.792156994342804, + "step": 460 + }, + { + "ce_loss": 0.3994184136390686, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "distill_loss": 0.2316671758890152, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "ref_ce_loss": 0.1596464067697525, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "loss": 1.0726959705352783, + "step": 460 + }, + { + "ce_loss": 0.44579944014549255, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "distill_loss": 0.24677278101444244, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "ref_ce_loss": 0.20206046104431152, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "loss": 0.8775638341903687, + "step": 460 + }, + { + "ce_loss": 0.344332754611969, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "distill_loss": 0.2588733732700348, + "epoch": 0.1534356237491661, + "step": 460 + }, + { + "epoch": 0.1534356237491661, + "ref_ce_loss": 0.12321383506059647, + "step": 460 + }, + { + "epoch": 0.15677118078719146, + "loss": 1.0912, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "grad_norm": 3.4049301147460938, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "learning_rate": 0.00019583333333333331, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "loss": 1.1278324127197266, + "step": 470 + }, + { + "ce_loss": 0.4387667775154114, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "distill_loss": 0.23371437191963196, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "ref_ce_loss": 0.16273000836372375, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "loss": 1.114906668663025, + "step": 470 + }, + { + "ce_loss": 0.4878920316696167, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "distill_loss": 0.2571012079715729, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "ref_ce_loss": 0.22147198021411896, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "loss": 0.914812445640564, + "step": 470 + }, + { + "ce_loss": 0.35870662331581116, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "distill_loss": 0.2166028916835785, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "ref_ce_loss": 0.18772242963314056, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "loss": 0.9531413912773132, + "step": 470 + }, + { + "ce_loss": 0.4369311034679413, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "distill_loss": 0.23241980373859406, + "epoch": 0.15677118078719146, + "step": 470 + }, + { + "epoch": 0.15677118078719146, + "ref_ce_loss": 0.13508175313472748, + "step": 470 + }, + { + "epoch": 0.1601067378252168, + "loss": 1.101, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "grad_norm": 3.4803426265716553, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "learning_rate": 0.00019999999999999998, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "loss": 0.9322621822357178, + "step": 480 + }, + { + "ce_loss": 0.33916839957237244, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "distill_loss": 0.19862687587738037, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "ref_ce_loss": 0.16116870939731598, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "loss": 0.8902009129524231, + "step": 480 + }, + { + "ce_loss": 0.32456398010253906, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "distill_loss": 0.17489784955978394, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "ref_ce_loss": 0.22662509977817535, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "loss": 1.1041722297668457, + "step": 480 + }, + { + "ce_loss": 0.31113237142562866, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "distill_loss": 0.17790481448173523, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "ref_ce_loss": 0.15077772736549377, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "loss": 0.8598635196685791, + "step": 480 + }, + { + "ce_loss": 0.418891042470932, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "distill_loss": 0.18824335932731628, + "epoch": 0.1601067378252168, + "step": 480 + }, + { + "epoch": 0.1601067378252168, + "ref_ce_loss": 0.14995791018009186, + "step": 480 + }, + { + "epoch": 0.16344229486324216, + "loss": 1.0941, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "grad_norm": 3.9171133041381836, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "learning_rate": 0.00020416666666666665, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "loss": 1.0149155855178833, + "step": 490 + }, + { + "ce_loss": 0.45266851782798767, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "distill_loss": 0.1602305769920349, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "ref_ce_loss": 0.20514211058616638, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "loss": 1.6531803607940674, + "step": 490 + }, + { + "ce_loss": 0.3911672830581665, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "distill_loss": 0.14901621639728546, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "ref_ce_loss": 0.2036956399679184, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "loss": 0.800538182258606, + "step": 490 + }, + { + "ce_loss": 0.27649155259132385, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "distill_loss": 0.13379120826721191, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "ref_ce_loss": 0.24327047169208527, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "loss": 1.8821849822998047, + "step": 490 + }, + { + "ce_loss": 0.38936328887939453, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "distill_loss": 0.1746000498533249, + "epoch": 0.16344229486324216, + "step": 490 + }, + { + "epoch": 0.16344229486324216, + "ref_ce_loss": 0.17974820733070374, + "step": 490 + }, + { + "epoch": 0.1667778519012675, + "loss": 1.0735, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "grad_norm": 3.739142417907715, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "learning_rate": 0.00020833333333333332, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "loss": 0.7304775714874268, + "step": 500 + }, + { + "ce_loss": 0.34158605337142944, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "distill_loss": 0.11969535052776337, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "ref_ce_loss": 0.1446818709373474, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "loss": 1.7273657321929932, + "step": 500 + }, + { + "ce_loss": 0.40708187222480774, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "distill_loss": 0.12989500164985657, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "ref_ce_loss": 0.22536632418632507, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "loss": 1.4432916641235352, + "step": 500 + }, + { + "ce_loss": 0.33398616313934326, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "distill_loss": 0.1335625797510147, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "ref_ce_loss": 0.14719289541244507, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "loss": 1.2549631595611572, + "step": 500 + }, + { + "ce_loss": 0.43018224835395813, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "distill_loss": 0.10706700384616852, + "epoch": 0.1667778519012675, + "step": 500 + }, + { + "epoch": 0.1667778519012675, + "ref_ce_loss": 0.24464663863182068, + "step": 500 + }, + { + "epoch": 0.17011340893929286, + "loss": 1.0963, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "grad_norm": 2.8536627292633057, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "learning_rate": 0.0002125, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "loss": 0.7844828367233276, + "step": 510 + }, + { + "ce_loss": 0.34540170431137085, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "distill_loss": 0.1451631784439087, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "ref_ce_loss": 0.14832229912281036, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "loss": 1.0483617782592773, + "step": 510 + }, + { + "ce_loss": 0.5033784508705139, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "distill_loss": 0.13474875688552856, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "ref_ce_loss": 0.2690085768699646, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "loss": 1.3711141347885132, + "step": 510 + }, + { + "ce_loss": 0.5016711354255676, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "distill_loss": 0.15381048619747162, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "ref_ce_loss": 0.254338800907135, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "loss": 0.7214844822883606, + "step": 510 + }, + { + "ce_loss": 0.35924723744392395, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "distill_loss": 0.13219282031059265, + "epoch": 0.17011340893929286, + "step": 510 + }, + { + "epoch": 0.17011340893929286, + "ref_ce_loss": 0.22950786352157593, + "step": 510 + }, + { + "epoch": 0.1734489659773182, + "loss": 1.072, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "grad_norm": 7.5163092613220215, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "learning_rate": 0.00021666666666666666, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "loss": 1.590034008026123, + "step": 520 + }, + { + "ce_loss": 0.5328056812286377, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "distill_loss": 0.1510966718196869, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "ref_ce_loss": 0.2138897031545639, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "loss": 1.0373549461364746, + "step": 520 + }, + { + "ce_loss": 0.3898574709892273, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "distill_loss": 0.1519639492034912, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "ref_ce_loss": 0.24447669088840485, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "loss": 1.6956017017364502, + "step": 520 + }, + { + "ce_loss": 0.4165439009666443, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "distill_loss": 0.1603700965642929, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "ref_ce_loss": 0.10957357287406921, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "loss": 1.230751633644104, + "step": 520 + }, + { + "ce_loss": 0.4296768307685852, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "distill_loss": 0.17311978340148926, + "epoch": 0.1734489659773182, + "step": 520 + }, + { + "epoch": 0.1734489659773182, + "ref_ce_loss": 0.17816101014614105, + "step": 520 + }, + { + "epoch": 0.17678452301534356, + "loss": 1.2013, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "grad_norm": 4.662562370300293, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "learning_rate": 0.00022083333333333333, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "loss": 1.1115416288375854, + "step": 530 + }, + { + "ce_loss": 0.4564555883407593, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "distill_loss": 0.12773512303829193, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "ref_ce_loss": 0.19250312447547913, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "loss": 0.6754852533340454, + "step": 530 + }, + { + "ce_loss": 0.39747318625450134, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "distill_loss": 0.1195475161075592, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "ref_ce_loss": 0.15837079286575317, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "loss": 0.7832016944885254, + "step": 530 + }, + { + "ce_loss": 0.42709383368492126, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "distill_loss": 0.13008397817611694, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "ref_ce_loss": 0.22590842843055725, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "loss": 1.5747027397155762, + "step": 530 + }, + { + "ce_loss": 0.38369321823120117, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "distill_loss": 0.121522918343544, + "epoch": 0.17678452301534356, + "step": 530 + }, + { + "epoch": 0.17678452301534356, + "ref_ce_loss": 0.1542671173810959, + "step": 530 + }, + { + "epoch": 0.1801200800533689, + "loss": 1.0564, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "grad_norm": 3.5200061798095703, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "learning_rate": 0.000225, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "loss": 1.145336389541626, + "step": 540 + }, + { + "ce_loss": 0.4209885895252228, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "distill_loss": 0.13889750838279724, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "ref_ce_loss": 0.1695491373538971, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "loss": 0.8331067562103271, + "step": 540 + }, + { + "ce_loss": 0.3781631290912628, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "distill_loss": 0.1160370260477066, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "ref_ce_loss": 0.21670031547546387, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "loss": 0.7429934740066528, + "step": 540 + }, + { + "ce_loss": 0.4223313331604004, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "distill_loss": 0.1255422979593277, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "ref_ce_loss": 0.1942986696958542, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "loss": 0.8002361059188843, + "step": 540 + }, + { + "ce_loss": 0.3722546398639679, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "distill_loss": 0.13500186800956726, + "epoch": 0.1801200800533689, + "step": 540 + }, + { + "epoch": 0.1801200800533689, + "ref_ce_loss": 0.14406229555606842, + "step": 540 + }, + { + "epoch": 0.18345563709139426, + "loss": 1.0981, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "grad_norm": 7.129374980926514, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "learning_rate": 0.00022916666666666664, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "loss": 0.6342393159866333, + "step": 550 + }, + { + "ce_loss": 0.37034982442855835, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "distill_loss": 0.12606871128082275, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "ref_ce_loss": 0.13699214160442352, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "loss": 0.8261252641677856, + "step": 550 + }, + { + "ce_loss": 0.394279420375824, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "distill_loss": 0.14237342774868011, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "ref_ce_loss": 0.17166593670845032, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "loss": 0.9936450719833374, + "step": 550 + }, + { + "ce_loss": 0.4587689936161041, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "distill_loss": 0.13692550361156464, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "ref_ce_loss": 0.26241040229797363, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "loss": 0.7960110902786255, + "step": 550 + }, + { + "ce_loss": 0.3535270094871521, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "distill_loss": 0.13536620140075684, + "epoch": 0.18345563709139426, + "step": 550 + }, + { + "epoch": 0.18345563709139426, + "ref_ce_loss": 0.1410633772611618, + "step": 550 + }, + { + "epoch": 0.1867911941294196, + "loss": 0.9803, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "grad_norm": 3.267789602279663, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "learning_rate": 0.0002333333333333333, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "loss": 0.9305562973022461, + "step": 560 + }, + { + "ce_loss": 0.3439280390739441, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "distill_loss": 0.1248282864689827, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "ref_ce_loss": 0.24716688692569733, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "loss": 0.537109911441803, + "step": 560 + }, + { + "ce_loss": 0.31539902091026306, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "distill_loss": 0.13532862067222595, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "ref_ce_loss": 0.08404301851987839, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "loss": 0.8709670901298523, + "step": 560 + }, + { + "ce_loss": 0.39863720536231995, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "distill_loss": 0.14135649800300598, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "ref_ce_loss": 0.2198602557182312, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "loss": 1.1067215204238892, + "step": 560 + }, + { + "ce_loss": 0.42708060145378113, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "distill_loss": 0.13257859647274017, + "epoch": 0.1867911941294196, + "step": 560 + }, + { + "epoch": 0.1867911941294196, + "ref_ce_loss": 0.2437812089920044, + "step": 560 + }, + { + "epoch": 0.19012675116744496, + "loss": 0.9827, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "grad_norm": 2.548273801803589, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "learning_rate": 0.00023749999999999997, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "loss": 1.6599349975585938, + "step": 570 + }, + { + "ce_loss": 0.4800955653190613, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "distill_loss": 0.16220739483833313, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "ref_ce_loss": 0.17117564380168915, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "loss": 0.6720627546310425, + "step": 570 + }, + { + "ce_loss": 0.3572465479373932, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "distill_loss": 0.12056800723075867, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "ref_ce_loss": 0.19394969940185547, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "loss": 1.1998469829559326, + "step": 570 + }, + { + "ce_loss": 0.44026824831962585, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "distill_loss": 0.1431874930858612, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "ref_ce_loss": 0.1669449657201767, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "loss": 1.1252778768539429, + "step": 570 + }, + { + "ce_loss": 0.36948126554489136, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "distill_loss": 0.13540102541446686, + "epoch": 0.19012675116744496, + "step": 570 + }, + { + "epoch": 0.19012675116744496, + "ref_ce_loss": 0.1495905965566635, + "step": 570 + }, + { + "epoch": 0.1934623082054703, + "loss": 1.0028, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "grad_norm": 2.2215306758880615, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "learning_rate": 0.00024166666666666664, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "loss": 0.7160823941230774, + "step": 580 + }, + { + "ce_loss": 0.4436323046684265, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "distill_loss": 0.1151004433631897, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "ref_ce_loss": 0.1572589874267578, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "loss": 0.8652709722518921, + "step": 580 + }, + { + "ce_loss": 0.4033513367176056, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "distill_loss": 0.11897167563438416, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "ref_ce_loss": 0.1534203588962555, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "loss": 0.930794358253479, + "step": 580 + }, + { + "ce_loss": 0.3531077206134796, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "distill_loss": 0.11472751200199127, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "ref_ce_loss": 0.15658818185329437, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "loss": 1.2479609251022339, + "step": 580 + }, + { + "ce_loss": 0.31892189383506775, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "distill_loss": 0.12005530297756195, + "epoch": 0.1934623082054703, + "step": 580 + }, + { + "epoch": 0.1934623082054703, + "ref_ce_loss": 0.1711868792772293, + "step": 580 + }, + { + "epoch": 0.19679786524349566, + "loss": 0.9568, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "grad_norm": 2.501621723175049, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "learning_rate": 0.0002458333333333333, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "loss": 0.8282630443572998, + "step": 590 + }, + { + "ce_loss": 0.44370871782302856, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "distill_loss": 0.11649394035339355, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "ref_ce_loss": 0.1873226761817932, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "loss": 0.9475926160812378, + "step": 590 + }, + { + "ce_loss": 0.42322638630867004, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "distill_loss": 0.10067001730203629, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "ref_ce_loss": 0.2594554126262665, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "loss": 1.049269437789917, + "step": 590 + }, + { + "ce_loss": 0.35470736026763916, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "distill_loss": 0.09396913647651672, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "ref_ce_loss": 0.23191304504871368, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "loss": 0.9053672552108765, + "step": 590 + }, + { + "ce_loss": 0.43494299054145813, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "distill_loss": 0.10486435890197754, + "epoch": 0.19679786524349566, + "step": 590 + }, + { + "epoch": 0.19679786524349566, + "ref_ce_loss": 0.21197301149368286, + "step": 590 + }, + { + "epoch": 0.200133422281521, + "loss": 1.0136, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "grad_norm": 5.192570209503174, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "learning_rate": 0.00025, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "loss": 0.9283386468887329, + "step": 600 + }, + { + "ce_loss": 0.44486886262893677, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "distill_loss": 0.10690561681985855, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "ref_ce_loss": 0.17460843920707703, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "loss": 1.5753425359725952, + "step": 600 + }, + { + "ce_loss": 0.3931669592857361, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "distill_loss": 0.09033727645874023, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "ref_ce_loss": 0.1294490545988083, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "loss": 0.9043337106704712, + "step": 600 + }, + { + "ce_loss": 0.4034048318862915, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "distill_loss": 0.1026686429977417, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "ref_ce_loss": 0.22998681664466858, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "loss": 0.9695954322814941, + "step": 600 + }, + { + "ce_loss": 0.514153778553009, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "distill_loss": 0.10593391954898834, + "epoch": 0.200133422281521, + "step": 600 + }, + { + "epoch": 0.200133422281521, + "ref_ce_loss": 0.21736301481723785, + "step": 600 + }, + { + "epoch": 0.20346897931954636, + "loss": 0.9875, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "grad_norm": 4.711723327636719, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "learning_rate": 0.00025416666666666665, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "loss": 1.0265181064605713, + "step": 610 + }, + { + "ce_loss": 0.27343955636024475, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "distill_loss": 0.11366772651672363, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "ref_ce_loss": 0.25925183296203613, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "loss": 1.0089905261993408, + "step": 610 + }, + { + "ce_loss": 0.6128965020179749, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "distill_loss": 0.13944478332996368, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "ref_ce_loss": 0.25663983821868896, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "loss": 0.7002043724060059, + "step": 610 + }, + { + "ce_loss": 0.37525349855422974, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "distill_loss": 0.11815596371889114, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "ref_ce_loss": 0.20678366720676422, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "loss": 1.0840277671813965, + "step": 610 + }, + { + "ce_loss": 0.3404729664325714, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "distill_loss": 0.13921114802360535, + "epoch": 0.20346897931954636, + "step": 610 + }, + { + "epoch": 0.20346897931954636, + "ref_ce_loss": 0.19724339246749878, + "step": 610 + }, + { + "epoch": 0.2068045363575717, + "loss": 1.1648, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "grad_norm": 15.244519233703613, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "learning_rate": 0.00025833333333333334, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "loss": 0.7298845052719116, + "step": 620 + }, + { + "ce_loss": 0.41301995515823364, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "distill_loss": 0.12899278104305267, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "ref_ce_loss": 0.1292322278022766, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "loss": 0.9002193212509155, + "step": 620 + }, + { + "ce_loss": 0.2968343198299408, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "distill_loss": 0.11593446880578995, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "ref_ce_loss": 0.17411155998706818, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "loss": 0.7344695329666138, + "step": 620 + }, + { + "ce_loss": 0.29947271943092346, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "distill_loss": 0.14619185030460358, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "ref_ce_loss": 0.10363147407770157, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "loss": 1.373020887374878, + "step": 620 + }, + { + "ce_loss": 0.40310031175613403, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "distill_loss": 0.15024641156196594, + "epoch": 0.2068045363575717, + "step": 620 + }, + { + "epoch": 0.2068045363575717, + "ref_ce_loss": 0.17175301909446716, + "step": 620 + }, + { + "epoch": 0.21014009339559706, + "loss": 1.0749, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "grad_norm": 8.212667465209961, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "learning_rate": 0.0002625, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "loss": 0.8715625405311584, + "step": 630 + }, + { + "ce_loss": 0.35520607233047485, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "distill_loss": 0.1376957893371582, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "ref_ce_loss": 0.19228589534759521, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "loss": 0.74106764793396, + "step": 630 + }, + { + "ce_loss": 0.3812822997570038, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "distill_loss": 0.104681096971035, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "ref_ce_loss": 0.2551042437553406, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "loss": 1.4336597919464111, + "step": 630 + }, + { + "ce_loss": 0.425566166639328, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "distill_loss": 0.13053010404109955, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "ref_ce_loss": 0.1743241250514984, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "loss": 0.8435543775558472, + "step": 630 + }, + { + "ce_loss": 0.3314042091369629, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "distill_loss": 0.12516427040100098, + "epoch": 0.21014009339559706, + "step": 630 + }, + { + "epoch": 0.21014009339559706, + "ref_ce_loss": 0.1005224660038948, + "step": 630 + }, + { + "epoch": 0.2134756504336224, + "loss": 1.0402, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "grad_norm": 5.008931636810303, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "learning_rate": 0.0002666666666666666, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "loss": 0.9882537722587585, + "step": 640 + }, + { + "ce_loss": 0.3984259366989136, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "distill_loss": 0.10313582420349121, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "ref_ce_loss": 0.2773117125034332, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "loss": 0.9406678080558777, + "step": 640 + }, + { + "ce_loss": 0.40200868248939514, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "distill_loss": 0.09374626725912094, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "ref_ce_loss": 0.22686932981014252, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "loss": 0.7817229628562927, + "step": 640 + }, + { + "ce_loss": 0.37328627705574036, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "distill_loss": 0.10063036531209946, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "ref_ce_loss": 0.21581311523914337, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "loss": 0.9889649152755737, + "step": 640 + }, + { + "ce_loss": 0.3842744827270508, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "distill_loss": 0.10192930698394775, + "epoch": 0.2134756504336224, + "step": 640 + }, + { + "epoch": 0.2134756504336224, + "ref_ce_loss": 0.17884069681167603, + "step": 640 + }, + { + "epoch": 0.21681120747164775, + "loss": 1.0506, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "grad_norm": 4.354966640472412, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "learning_rate": 0.0002708333333333333, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "loss": 0.7693982720375061, + "step": 650 + }, + { + "ce_loss": 0.3526293933391571, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "distill_loss": 0.11822477728128433, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "ref_ce_loss": 0.15306073427200317, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "loss": 0.7925047874450684, + "step": 650 + }, + { + "ce_loss": 0.3771379888057709, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "distill_loss": 0.13101331889629364, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "ref_ce_loss": 0.15947893261909485, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "loss": 0.7453879117965698, + "step": 650 + }, + { + "ce_loss": 0.3240864872932434, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "distill_loss": 0.1208181232213974, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "ref_ce_loss": 0.16739091277122498, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "loss": 1.2144562005996704, + "step": 650 + }, + { + "ce_loss": 0.3694230020046234, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "distill_loss": 0.12667544186115265, + "epoch": 0.21681120747164775, + "step": 650 + }, + { + "epoch": 0.21681120747164775, + "ref_ce_loss": 0.1709766983985901, + "step": 650 + }, + { + "epoch": 0.2201467645096731, + "loss": 1.0487, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "grad_norm": 4.961103439331055, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "learning_rate": 0.00027499999999999996, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "loss": 0.696092426776886, + "step": 660 + }, + { + "ce_loss": 0.41480615735054016, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "distill_loss": 0.09278301894664764, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "ref_ce_loss": 0.18769042193889618, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "loss": 1.1655255556106567, + "step": 660 + }, + { + "ce_loss": 0.38108381628990173, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "distill_loss": 0.10685959458351135, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "ref_ce_loss": 0.2142338752746582, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "loss": 1.8354722261428833, + "step": 660 + }, + { + "ce_loss": 0.35269877314567566, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "distill_loss": 0.0982913225889206, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "ref_ce_loss": 0.18363001942634583, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "loss": 0.9739862680435181, + "step": 660 + }, + { + "ce_loss": 0.487895667552948, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "distill_loss": 0.11279213428497314, + "epoch": 0.2201467645096731, + "step": 660 + }, + { + "epoch": 0.2201467645096731, + "ref_ce_loss": 0.21531665325164795, + "step": 660 + }, + { + "epoch": 0.22348232154769845, + "loss": 0.9574, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "grad_norm": 2.3174378871917725, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "learning_rate": 0.00027916666666666666, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "loss": 1.0270733833312988, + "step": 670 + }, + { + "ce_loss": 0.4073409140110016, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "distill_loss": 0.08957529067993164, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "ref_ce_loss": 0.22530145943164825, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "loss": 0.8506837487220764, + "step": 670 + }, + { + "ce_loss": 0.3899139165878296, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "distill_loss": 0.08442845940589905, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "ref_ce_loss": 0.15433235466480255, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "loss": 1.01683509349823, + "step": 670 + }, + { + "ce_loss": 0.37784865498542786, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "distill_loss": 0.10341054946184158, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "ref_ce_loss": 0.1931767463684082, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "loss": 0.8917780518531799, + "step": 670 + }, + { + "ce_loss": 0.4849671423435211, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "distill_loss": 0.09322772920131683, + "epoch": 0.22348232154769845, + "step": 670 + }, + { + "epoch": 0.22348232154769845, + "ref_ce_loss": 0.17773941159248352, + "step": 670 + }, + { + "epoch": 0.2268178785857238, + "loss": 0.9316, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "grad_norm": 2.05058217048645, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "learning_rate": 0.0002833333333333333, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "loss": 0.7885412573814392, + "step": 680 + }, + { + "ce_loss": 0.4074735939502716, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "distill_loss": 0.1123306155204773, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "ref_ce_loss": 0.1499427706003189, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "loss": 1.295933485031128, + "step": 680 + }, + { + "ce_loss": 0.3566689193248749, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "distill_loss": 0.10098881274461746, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "ref_ce_loss": 0.21255967020988464, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "loss": 2.1791958808898926, + "step": 680 + }, + { + "ce_loss": 0.45526185631752014, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "distill_loss": 0.10729824006557465, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "ref_ce_loss": 0.20944558084011078, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "loss": 1.860978364944458, + "step": 680 + }, + { + "ce_loss": 0.5708116292953491, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "distill_loss": 0.12268486618995667, + "epoch": 0.2268178785857238, + "step": 680 + }, + { + "epoch": 0.2268178785857238, + "ref_ce_loss": 0.20244957506656647, + "step": 680 + }, + { + "epoch": 0.23015343562374915, + "loss": 1.1451, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "grad_norm": 5.780163288116455, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "learning_rate": 0.0002875, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "loss": 0.6632769107818604, + "step": 690 + }, + { + "ce_loss": 0.356410950422287, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "distill_loss": 0.11251110583543777, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "ref_ce_loss": 0.1943245828151703, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "loss": 0.8603688478469849, + "step": 690 + }, + { + "ce_loss": 0.46949803829193115, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "distill_loss": 0.12070250511169434, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "ref_ce_loss": 0.20107735693454742, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "loss": 0.6247305870056152, + "step": 690 + }, + { + "ce_loss": 0.38976994156837463, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "distill_loss": 0.11423636227846146, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "ref_ce_loss": 0.12047644704580307, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "loss": 1.4882597923278809, + "step": 690 + }, + { + "ce_loss": 0.426405131816864, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "distill_loss": 0.09722738713026047, + "epoch": 0.23015343562374915, + "step": 690 + }, + { + "epoch": 0.23015343562374915, + "ref_ce_loss": 0.2013929933309555, + "step": 690 + }, + { + "epoch": 0.2334889926617745, + "loss": 1.0173, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "grad_norm": 3.844639301300049, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "learning_rate": 0.00029166666666666664, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "loss": 1.1349132061004639, + "step": 700 + }, + { + "ce_loss": 0.3564150333404541, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "distill_loss": 0.09892964363098145, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "ref_ce_loss": 0.15781459212303162, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "loss": 0.7385631203651428, + "step": 700 + }, + { + "ce_loss": 0.3456648886203766, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "distill_loss": 0.09562936425209045, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "ref_ce_loss": 0.19819499552249908, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "loss": 1.3640516996383667, + "step": 700 + }, + { + "ce_loss": 0.44929933547973633, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "distill_loss": 0.09775910526514053, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "ref_ce_loss": 0.2679179906845093, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "loss": 0.7409743666648865, + "step": 700 + }, + { + "ce_loss": 0.322372704744339, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "distill_loss": 0.09759743511676788, + "epoch": 0.2334889926617745, + "step": 700 + }, + { + "epoch": 0.2334889926617745, + "ref_ce_loss": 0.19925224781036377, + "step": 700 + }, + { + "epoch": 0.23682454969979988, + "loss": 1.076, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "grad_norm": 3.4155962467193604, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "learning_rate": 0.00029583333333333333, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "loss": 0.8137826919555664, + "step": 710 + }, + { + "ce_loss": 0.3987344205379486, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "distill_loss": 0.12683498859405518, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "ref_ce_loss": 0.17653390765190125, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "loss": 0.8795384764671326, + "step": 710 + }, + { + "ce_loss": 0.4477671980857849, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "distill_loss": 0.1383477747440338, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "ref_ce_loss": 0.1700534224510193, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "loss": 0.7244356274604797, + "step": 710 + }, + { + "ce_loss": 0.3994516134262085, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "distill_loss": 0.11458105593919754, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "ref_ce_loss": 0.16034944355487823, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "loss": 1.2054007053375244, + "step": 710 + }, + { + "ce_loss": 0.31495407223701477, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "distill_loss": 0.11541106551885605, + "epoch": 0.23682454969979988, + "step": 710 + }, + { + "epoch": 0.23682454969979988, + "ref_ce_loss": 0.15973275899887085, + "step": 710 + }, + { + "epoch": 0.24016010673782523, + "loss": 0.9533, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "grad_norm": 2.230461597442627, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "learning_rate": 0.0003, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "loss": 1.5545556545257568, + "step": 720 + }, + { + "ce_loss": 0.43651020526885986, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "distill_loss": 0.09635943919420242, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "ref_ce_loss": 0.1669834852218628, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "loss": 0.8339600563049316, + "step": 720 + }, + { + "ce_loss": 0.37342774868011475, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "distill_loss": 0.10773156583309174, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "ref_ce_loss": 0.11810831725597382, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "loss": 1.0487580299377441, + "step": 720 + }, + { + "ce_loss": 0.38172048330307007, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "distill_loss": 0.09679360687732697, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "ref_ce_loss": 0.2521325349807739, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "loss": 0.7091606259346008, + "step": 720 + }, + { + "ce_loss": 0.3137376010417938, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "distill_loss": 0.08482564985752106, + "epoch": 0.24016010673782523, + "step": 720 + }, + { + "epoch": 0.24016010673782523, + "ref_ce_loss": 0.13457347452640533, + "step": 720 + }, + { + "epoch": 0.24349566377585058, + "loss": 0.9342, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "grad_norm": 6.439743518829346, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "learning_rate": 0.00029999986322958505, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "loss": 0.6478908061981201, + "step": 730 + }, + { + "ce_loss": 0.3937546908855438, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "distill_loss": 0.11202051490545273, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "ref_ce_loss": 0.14208683371543884, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "loss": 0.864059329032898, + "step": 730 + }, + { + "ce_loss": 0.3594917356967926, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "distill_loss": 0.11220365017652512, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "ref_ce_loss": 0.17011480033397675, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "loss": 1.1030160188674927, + "step": 730 + }, + { + "ce_loss": 0.2855687141418457, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "distill_loss": 0.09748795628547668, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "ref_ce_loss": 0.20837855339050293, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "loss": 0.8385856747627258, + "step": 730 + }, + { + "ce_loss": 0.4496913254261017, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "distill_loss": 0.11531084030866623, + "epoch": 0.24349566377585058, + "step": 730 + }, + { + "epoch": 0.24349566377585058, + "ref_ce_loss": 0.1728518158197403, + "step": 730 + }, + { + "epoch": 0.24683122081387593, + "loss": 0.9298, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "grad_norm": 5.409254550933838, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "learning_rate": 0.00029999945291858974, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "loss": 0.9843995571136475, + "step": 740 + }, + { + "ce_loss": 0.37632668018341064, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "distill_loss": 0.11238445341587067, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "ref_ce_loss": 0.20602065324783325, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "loss": 1.8616141080856323, + "step": 740 + }, + { + "ce_loss": 0.46412500739097595, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "distill_loss": 0.1517205834388733, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "ref_ce_loss": 0.18234631419181824, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "loss": 0.7307447791099548, + "step": 740 + }, + { + "ce_loss": 0.33168455958366394, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "distill_loss": 0.14137397706508636, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "ref_ce_loss": 0.1374725103378296, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "loss": 1.3032493591308594, + "step": 740 + }, + { + "ce_loss": 0.3302554488182068, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "distill_loss": 0.1149841845035553, + "epoch": 0.24683122081387593, + "step": 740 + }, + { + "epoch": 0.24683122081387593, + "ref_ce_loss": 0.16944146156311035, + "step": 740 + }, + { + "epoch": 0.2501667778519013, + "loss": 1.0939, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "grad_norm": 2.4583041667938232, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "learning_rate": 0.0002999987690677622, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "loss": 1.895957589149475, + "step": 750 + }, + { + "ce_loss": 0.4463769793510437, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "distill_loss": 0.1424015462398529, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "ref_ce_loss": 0.28784894943237305, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "loss": 0.755088210105896, + "step": 750 + }, + { + "ce_loss": 0.32497096061706543, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "distill_loss": 0.11568926274776459, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "ref_ce_loss": 0.3128649592399597, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "loss": 0.8456001877784729, + "step": 750 + }, + { + "ce_loss": 0.38667964935302734, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "distill_loss": 0.11649499833583832, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "ref_ce_loss": 0.2550145983695984, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "loss": 0.7525032758712769, + "step": 750 + }, + { + "ce_loss": 0.4029221832752228, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "distill_loss": 0.12815462052822113, + "epoch": 0.2501667778519013, + "step": 750 + }, + { + "epoch": 0.2501667778519013, + "ref_ce_loss": 0.22105097770690918, + "step": 750 + }, + { + "epoch": 0.25350233488992663, + "loss": 1.1464, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "grad_norm": 6.5162835121154785, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "learning_rate": 0.0002999978116783497, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "loss": 1.1063671112060547, + "step": 760 + }, + { + "ce_loss": 0.3920098841190338, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "distill_loss": 0.48732489347457886, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "ref_ce_loss": 0.2269572615623474, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "loss": 2.2254278659820557, + "step": 760 + }, + { + "ce_loss": 0.4064970314502716, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "distill_loss": 0.519298255443573, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "ref_ce_loss": 0.2351590245962143, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "loss": 1.4916338920593262, + "step": 760 + }, + { + "ce_loss": 0.4203934073448181, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "distill_loss": 0.5812166929244995, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "ref_ce_loss": 0.19478872418403625, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "loss": 1.521181583404541, + "step": 760 + }, + { + "ce_loss": 0.41617247462272644, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "distill_loss": 0.5565536618232727, + "epoch": 0.25350233488992663, + "step": 760 + }, + { + "epoch": 0.25350233488992663, + "ref_ce_loss": 0.20470012724399567, + "step": 760 + }, + { + "epoch": 0.256837891927952, + "loss": 1.2578, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "grad_norm": 3.5118868350982666, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "learning_rate": 0.00029999658075209785, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "loss": 1.4006614685058594, + "step": 770 + }, + { + "ce_loss": 0.40021345019340515, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "distill_loss": 0.6183995008468628, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "ref_ce_loss": 0.1635797619819641, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "loss": 2.101757526397705, + "step": 770 + }, + { + "ce_loss": 0.3580198287963867, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "distill_loss": 0.5030682682991028, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "ref_ce_loss": 0.19172348082065582, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "loss": 1.6819496154785156, + "step": 770 + }, + { + "ce_loss": 0.35157284140586853, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "distill_loss": 0.5677676796913147, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "ref_ce_loss": 0.19775229692459106, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "loss": 1.4923537969589233, + "step": 770 + }, + { + "ce_loss": 0.3099478781223297, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "distill_loss": 0.4962100386619568, + "epoch": 0.256837891927952, + "step": 770 + }, + { + "epoch": 0.256837891927952, + "ref_ce_loss": 0.25995033979415894, + "step": 770 + }, + { + "epoch": 0.2601734489659773, + "loss": 1.1482, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "grad_norm": 4.092563152313232, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "learning_rate": 0.0002999950762912516, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "loss": 0.9934489727020264, + "step": 780 + }, + { + "ce_loss": 0.4670727849006653, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "distill_loss": 0.21604394912719727, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "ref_ce_loss": 0.21766111254692078, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "loss": 0.7785146832466125, + "step": 780 + }, + { + "ce_loss": 0.4251066744327545, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "distill_loss": 0.23343425989151, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "ref_ce_loss": 0.11992902308702469, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "loss": 0.9308755397796631, + "step": 780 + }, + { + "ce_loss": 0.43334394693374634, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "distill_loss": 0.22830483317375183, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "ref_ce_loss": 0.15851473808288574, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "loss": 1.02692711353302, + "step": 780 + }, + { + "ce_loss": 0.4778960645198822, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "distill_loss": 0.27986395359039307, + "epoch": 0.2601734489659773, + "step": 780 + }, + { + "epoch": 0.2601734489659773, + "ref_ce_loss": 0.14882494509220123, + "step": 780 + }, + { + "epoch": 0.2635090060040027, + "loss": 1.1524, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "grad_norm": 5.4341020584106445, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "learning_rate": 0.00029999329829855445, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "loss": 0.7699970006942749, + "step": 790 + }, + { + "ce_loss": 0.3887271285057068, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "distill_loss": 0.1653989851474762, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "ref_ce_loss": 0.21547864377498627, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "loss": 0.9414461851119995, + "step": 790 + }, + { + "ce_loss": 0.44151678681373596, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "distill_loss": 0.15568041801452637, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "ref_ce_loss": 0.1594770848751068, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "loss": 0.846574068069458, + "step": 790 + }, + { + "ce_loss": 0.36872443556785583, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "distill_loss": 0.15134361386299133, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "ref_ce_loss": 0.21225082874298096, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "loss": 1.0768730640411377, + "step": 790 + }, + { + "ce_loss": 0.4060359299182892, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "distill_loss": 0.1718452423810959, + "epoch": 0.2635090060040027, + "step": 790 + }, + { + "epoch": 0.2635090060040027, + "ref_ce_loss": 0.1805437058210373, + "step": 790 + }, + { + "epoch": 0.266844563042028, + "loss": 0.9391, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "grad_norm": 2.150057554244995, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "learning_rate": 0.0002999912467772487, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "loss": 0.8158339262008667, + "step": 800 + }, + { + "ce_loss": 0.4157412648200989, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "distill_loss": 0.14980031549930573, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "ref_ce_loss": 0.15453822910785675, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "loss": 0.7618894577026367, + "step": 800 + }, + { + "ce_loss": 0.34882333874702454, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "distill_loss": 0.14651793241500854, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "ref_ce_loss": 0.13346315920352936, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "loss": 1.3546881675720215, + "step": 800 + }, + { + "ce_loss": 0.43503326177597046, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "distill_loss": 0.1588219702243805, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "ref_ce_loss": 0.254705548286438, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "loss": 1.3324687480926514, + "step": 800 + }, + { + "ce_loss": 0.446909099817276, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "distill_loss": 0.1556074321269989, + "epoch": 0.266844563042028, + "step": 800 + }, + { + "epoch": 0.266844563042028, + "ref_ce_loss": 0.12691357731819153, + "step": 800 + }, + { + "epoch": 0.2701801200800534, + "loss": 0.9072, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "grad_norm": 3.318474054336548, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "learning_rate": 0.0002999889217310755, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "loss": 0.8637751340866089, + "step": 810 + }, + { + "ce_loss": 0.4324488341808319, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "distill_loss": 0.19835472106933594, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "ref_ce_loss": 0.23256337642669678, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "loss": 0.8926365375518799, + "step": 810 + }, + { + "ce_loss": 0.3715226352214813, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "distill_loss": 0.24980111420154572, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "ref_ce_loss": 0.27107831835746765, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "loss": 1.0381509065628052, + "step": 810 + }, + { + "ce_loss": 0.42502278089523315, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "distill_loss": 0.22801654040813446, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "ref_ce_loss": 0.18256254494190216, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "loss": 1.7744220495224, + "step": 810 + }, + { + "ce_loss": 0.4806549549102783, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "distill_loss": 0.18088483810424805, + "epoch": 0.2701801200800534, + "step": 810 + }, + { + "epoch": 0.2701801200800534, + "ref_ce_loss": 0.2571941316127777, + "step": 810 + }, + { + "epoch": 0.2735156771180787, + "loss": 1.1348, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "grad_norm": 4.989599227905273, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "learning_rate": 0.00029998632316427493, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "loss": 0.7250751852989197, + "step": 820 + }, + { + "ce_loss": 0.34602952003479004, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "distill_loss": 0.1138123869895935, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "ref_ce_loss": 0.1507563441991806, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "loss": 0.9217355251312256, + "step": 820 + }, + { + "ce_loss": 0.37067872285842896, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "distill_loss": 0.12390212714672089, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "ref_ce_loss": 0.1725834459066391, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "loss": 0.6601697206497192, + "step": 820 + }, + { + "ce_loss": 0.35830461978912354, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "distill_loss": 0.1254177987575531, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "ref_ce_loss": 0.17583151161670685, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "loss": 0.9171438217163086, + "step": 820 + }, + { + "ce_loss": 0.3662046194076538, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "distill_loss": 0.10298191010951996, + "epoch": 0.2735156771180787, + "step": 820 + }, + { + "epoch": 0.2735156771180787, + "ref_ce_loss": 0.16088753938674927, + "step": 820 + }, + { + "epoch": 0.2768512341561041, + "loss": 1.0359, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "grad_norm": 6.688352584838867, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "learning_rate": 0.0002999834510815857, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "loss": 0.6852495670318604, + "step": 830 + }, + { + "ce_loss": 0.27872976660728455, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "distill_loss": 0.10291218012571335, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "ref_ce_loss": 0.1563701629638672, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "loss": 1.3477394580841064, + "step": 830 + }, + { + "ce_loss": 0.3723379969596863, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "distill_loss": 0.12437531352043152, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "ref_ce_loss": 0.22796234488487244, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "loss": 0.8273101449012756, + "step": 830 + }, + { + "ce_loss": 0.37694263458251953, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "distill_loss": 0.108419269323349, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "ref_ce_loss": 0.2073294222354889, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "loss": 1.1366130113601685, + "step": 830 + }, + { + "ce_loss": 0.5759130120277405, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "distill_loss": 0.12910136580467224, + "epoch": 0.2768512341561041, + "step": 830 + }, + { + "epoch": 0.2768512341561041, + "ref_ce_loss": 0.2354922592639923, + "step": 830 + }, + { + "epoch": 0.2801867911941294, + "loss": 1.2926, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "grad_norm": 48.90994644165039, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "learning_rate": 0.00029998030548824525, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "loss": 1.7805827856063843, + "step": 840 + }, + { + "ce_loss": 0.3729347884654999, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "distill_loss": 1.0355364084243774, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "ref_ce_loss": 0.2145635336637497, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "loss": 1.8465008735656738, + "step": 840 + }, + { + "ce_loss": 0.4972056746482849, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "distill_loss": 1.098988652229309, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "ref_ce_loss": 0.25016623735427856, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "loss": 2.252458095550537, + "step": 840 + }, + { + "ce_loss": 0.49673929810523987, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "distill_loss": 1.036517858505249, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "ref_ce_loss": 0.27009356021881104, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "loss": 2.5026607513427734, + "step": 840 + }, + { + "ce_loss": 0.4665423333644867, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "distill_loss": 0.9117804765701294, + "epoch": 0.2801867911941294, + "step": 840 + }, + { + "epoch": 0.2801867911941294, + "ref_ce_loss": 0.27563977241516113, + "step": 840 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.4018, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "grad_norm": 2.755733013153076, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "learning_rate": 0.0002999768863899901, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.6243078708648682, + "step": 850 + }, + { + "ce_loss": 0.3918367624282837, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "distill_loss": 0.7303428053855896, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "ref_ce_loss": 0.17016659677028656, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.8513051271438599, + "step": 850 + }, + { + "ce_loss": 0.32263848185539246, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "distill_loss": 0.661507248878479, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "ref_ce_loss": 0.14734257757663727, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.4546396732330322, + "step": 850 + }, + { + "ce_loss": 0.3858022689819336, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "distill_loss": 0.6699823141098022, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "ref_ce_loss": 0.16758663952350616, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "loss": 1.7295818328857422, + "step": 850 + }, + { + "ce_loss": 0.3436650037765503, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "distill_loss": 0.7292658686637878, + "epoch": 0.2835223482321548, + "step": 850 + }, + { + "epoch": 0.2835223482321548, + "ref_ce_loss": 0.155188649892807, + "step": 850 + }, + { + "epoch": 0.2868579052701801, + "loss": 1.229, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "grad_norm": 3.504427194595337, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "learning_rate": 0.00029997319379305515, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "loss": 0.9320521354675293, + "step": 860 + }, + { + "ce_loss": 0.28551921248435974, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "distill_loss": 0.2454386055469513, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "ref_ce_loss": 0.19818471372127533, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "loss": 1.5051312446594238, + "step": 860 + }, + { + "ce_loss": 0.42141714692115784, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "distill_loss": 0.29946091771125793, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "ref_ce_loss": 0.1941777914762497, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "loss": 0.8788027763366699, + "step": 860 + }, + { + "ce_loss": 0.3769928216934204, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "distill_loss": 0.24679483473300934, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "ref_ce_loss": 0.17477509379386902, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "loss": 1.0972120761871338, + "step": 860 + }, + { + "ce_loss": 0.3971550464630127, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "distill_loss": 0.2257571965456009, + "epoch": 0.2868579052701801, + "step": 860 + }, + { + "epoch": 0.2868579052701801, + "ref_ce_loss": 0.23073622584342957, + "step": 860 + }, + { + "epoch": 0.2901934623082055, + "loss": 1.2198, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "grad_norm": 2.217575788497925, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "learning_rate": 0.00029996922770417434, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "loss": 1.7647467851638794, + "step": 870 + }, + { + "ce_loss": 0.39066219329833984, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "distill_loss": 0.27164989709854126, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "ref_ce_loss": 0.16236373782157898, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "loss": 1.2479718923568726, + "step": 870 + }, + { + "ce_loss": 0.39999279379844666, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "distill_loss": 0.3134003281593323, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "ref_ce_loss": 0.23171456158161163, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "loss": 1.099112868309021, + "step": 870 + }, + { + "ce_loss": 0.4728688895702362, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "distill_loss": 0.32087886333465576, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "ref_ce_loss": 0.16885533928871155, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "loss": 1.2090306282043457, + "step": 870 + }, + { + "ce_loss": 0.32257935404777527, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "distill_loss": 0.2634151577949524, + "epoch": 0.2901934623082055, + "step": 870 + }, + { + "epoch": 0.2901934623082055, + "ref_ce_loss": 0.19036482274532318, + "step": 870 + }, + { + "epoch": 0.2935290193462308, + "loss": 1.0661, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "grad_norm": 2.9561502933502197, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "learning_rate": 0.00029996498813058024, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "loss": 0.9875502586364746, + "step": 880 + }, + { + "ce_loss": 0.4626186788082123, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "distill_loss": 0.2516644597053528, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "ref_ce_loss": 0.17623616755008698, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "loss": 0.6726231575012207, + "step": 880 + }, + { + "ce_loss": 0.2600361108779907, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "distill_loss": 0.1706208437681198, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "ref_ce_loss": 0.15342611074447632, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "loss": 0.5651594996452332, + "step": 880 + }, + { + "ce_loss": 0.24951525032520294, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "distill_loss": 0.13484697043895721, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "ref_ce_loss": 0.1341100037097931, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "loss": 1.158497929573059, + "step": 880 + }, + { + "ce_loss": 0.2759685218334198, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "distill_loss": 0.20097249746322632, + "epoch": 0.2935290193462308, + "step": 880 + }, + { + "epoch": 0.2935290193462308, + "ref_ce_loss": 0.18886953592300415, + "step": 880 + }, + { + "epoch": 0.2968645763842562, + "loss": 1.0493, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "grad_norm": 4.124588966369629, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "learning_rate": 0.0002999604750800042, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "loss": 1.3233633041381836, + "step": 890 + }, + { + "ce_loss": 0.43904969096183777, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "distill_loss": 0.20250177383422852, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "ref_ce_loss": 0.26501935720443726, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "loss": 0.9193978905677795, + "step": 890 + }, + { + "ce_loss": 0.4037984013557434, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "distill_loss": 0.17271189391613007, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "ref_ce_loss": 0.21354354918003082, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "loss": 0.8298638463020325, + "step": 890 + }, + { + "ce_loss": 0.4446713626384735, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "distill_loss": 0.18691898882389069, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "ref_ce_loss": 0.19824057817459106, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "loss": 0.8961043953895569, + "step": 890 + }, + { + "ce_loss": 0.3133203983306885, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "distill_loss": 0.18369875848293304, + "epoch": 0.2968645763842562, + "step": 890 + }, + { + "epoch": 0.2968645763842562, + "ref_ce_loss": 0.14095714688301086, + "step": 890 + }, + { + "epoch": 0.3002001334222815, + "loss": 0.9395, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "grad_norm": 3.79996657371521, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "learning_rate": 0.0002999556885606761, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "loss": 0.4957481324672699, + "step": 900 + }, + { + "ce_loss": 0.2566486597061157, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "distill_loss": 0.11270153522491455, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "ref_ce_loss": 0.1263445019721985, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "loss": 0.7489277124404907, + "step": 900 + }, + { + "ce_loss": 0.32021090388298035, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "distill_loss": 0.11752177029848099, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "ref_ce_loss": 0.1603088527917862, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "loss": 0.8972264528274536, + "step": 900 + }, + { + "ce_loss": 0.41744017601013184, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "distill_loss": 0.137289896607399, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "ref_ce_loss": 0.2510444223880768, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "loss": 1.0207258462905884, + "step": 900 + }, + { + "ce_loss": 0.4178427755832672, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "distill_loss": 0.1229424700140953, + "epoch": 0.3002001334222815, + "step": 900 + }, + { + "epoch": 0.3002001334222815, + "ref_ce_loss": 0.18919920921325684, + "step": 900 + }, + { + "epoch": 0.3035356904603069, + "loss": 0.9636, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "grad_norm": 2.994086742401123, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "learning_rate": 0.00029995062858132485, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "loss": 0.9765963554382324, + "step": 910 + }, + { + "ce_loss": 0.3894851505756378, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "distill_loss": 0.1433676779270172, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "ref_ce_loss": 0.13310347497463226, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "loss": 1.011177659034729, + "step": 910 + }, + { + "ce_loss": 0.3391077220439911, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "distill_loss": 0.1197926327586174, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "ref_ce_loss": 0.19934594631195068, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "loss": 0.6123791337013245, + "step": 910 + }, + { + "ce_loss": 0.26249223947525024, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "distill_loss": 0.1554940640926361, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "ref_ce_loss": 0.08239661157131195, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "loss": 1.058248519897461, + "step": 910 + }, + { + "ce_loss": 0.39104774594306946, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "distill_loss": 0.1529640257358551, + "epoch": 0.3035356904603069, + "step": 910 + }, + { + "epoch": 0.3035356904603069, + "ref_ce_loss": 0.1721760630607605, + "step": 910 + }, + { + "epoch": 0.3068712474983322, + "loss": 0.954, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "grad_norm": 3.0830633640289307, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "learning_rate": 0.00029994529515117767, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "loss": 1.0534474849700928, + "step": 920 + }, + { + "ce_loss": 0.4601081311702728, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "distill_loss": 0.1674957424402237, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "ref_ce_loss": 0.13673503696918488, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "loss": 1.3385803699493408, + "step": 920 + }, + { + "ce_loss": 0.3786323368549347, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "distill_loss": 0.16205266118049622, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "ref_ce_loss": 0.16322043538093567, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "loss": 0.7872010469436646, + "step": 920 + }, + { + "ce_loss": 0.40525513887405396, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "distill_loss": 0.15195631980895996, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "ref_ce_loss": 0.1313735991716385, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "loss": 0.9474194049835205, + "step": 920 + }, + { + "ce_loss": 0.36184266209602356, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "distill_loss": 0.1604013741016388, + "epoch": 0.3068712474983322, + "step": 920 + }, + { + "epoch": 0.3068712474983322, + "ref_ce_loss": 0.16650882363319397, + "step": 920 + }, + { + "epoch": 0.31020680453635757, + "loss": 1.0615, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "grad_norm": 6.1084465980529785, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "learning_rate": 0.0002999396882799608, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "loss": 0.8135707974433899, + "step": 930 + }, + { + "ce_loss": 0.48061808943748474, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "distill_loss": 0.12107715755701065, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "ref_ce_loss": 0.21182826161384583, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "loss": 1.6873639822006226, + "step": 930 + }, + { + "ce_loss": 0.4000921845436096, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "distill_loss": 0.11861014366149902, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "ref_ce_loss": 0.19486136734485626, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "loss": 0.6255282163619995, + "step": 930 + }, + { + "ce_loss": 0.29100140929222107, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "distill_loss": 0.11383562535047531, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "ref_ce_loss": 0.16077548265457153, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "loss": 1.0830172300338745, + "step": 930 + }, + { + "ce_loss": 0.406831830739975, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "distill_loss": 0.12229843437671661, + "epoch": 0.31020680453635757, + "step": 930 + }, + { + "epoch": 0.31020680453635757, + "ref_ce_loss": 0.18925532698631287, + "step": 930 + }, + { + "epoch": 0.3135423615743829, + "loss": 1.0012, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "grad_norm": 2.763657808303833, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "learning_rate": 0.00029993380797789884, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "loss": 1.2902885675430298, + "step": 940 + }, + { + "ce_loss": 0.5113952159881592, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "distill_loss": 0.15199342370033264, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "ref_ce_loss": 0.18690067529678345, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "loss": 0.7129663825035095, + "step": 940 + }, + { + "ce_loss": 0.40314140915870667, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "distill_loss": 0.1183619499206543, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "ref_ce_loss": 0.18934160470962524, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "loss": 0.8741893768310547, + "step": 940 + }, + { + "ce_loss": 0.3346640467643738, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "distill_loss": 0.13830089569091797, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "ref_ce_loss": 0.1432468295097351, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "loss": 0.8341860771179199, + "step": 940 + }, + { + "ce_loss": 0.418517142534256, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "distill_loss": 0.1469680368900299, + "epoch": 0.3135423615743829, + "step": 940 + }, + { + "epoch": 0.3135423615743829, + "ref_ce_loss": 0.13575048744678497, + "step": 940 + }, + { + "epoch": 0.31687791861240827, + "loss": 0.9327, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "grad_norm": 3.665161371231079, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "learning_rate": 0.0002999276542557152, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "loss": 0.7519707083702087, + "step": 950 + }, + { + "ce_loss": 0.302935928106308, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "distill_loss": 0.13023529946804047, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "ref_ce_loss": 0.17454656958580017, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "loss": 1.0623005628585815, + "step": 950 + }, + { + "ce_loss": 0.309317946434021, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "distill_loss": 0.13614854216575623, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "ref_ce_loss": 0.20230692625045776, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "loss": 1.529290795326233, + "step": 950 + }, + { + "ce_loss": 0.281059592962265, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "distill_loss": 0.13188424706459045, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "ref_ce_loss": 0.19181722402572632, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "loss": 1.0552257299423218, + "step": 950 + }, + { + "ce_loss": 0.3567151427268982, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "distill_loss": 0.14000089466571808, + "epoch": 0.31687791861240827, + "step": 950 + }, + { + "epoch": 0.31687791861240827, + "ref_ce_loss": 0.18195955455303192, + "step": 950 + }, + { + "epoch": 0.3202134756504336, + "loss": 0.9537, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "grad_norm": 2.9725399017333984, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "learning_rate": 0.00029992122712463185, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "loss": 0.8172087073326111, + "step": 960 + }, + { + "ce_loss": 0.4168473482131958, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "distill_loss": 0.12376371771097183, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "ref_ce_loss": 0.27427318692207336, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "loss": 0.7152084708213806, + "step": 960 + }, + { + "ce_loss": 0.3916681706905365, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "distill_loss": 0.1268494725227356, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "ref_ce_loss": 0.19664986431598663, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "loss": 0.750626802444458, + "step": 960 + }, + { + "ce_loss": 0.358602911233902, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "distill_loss": 0.12475446611642838, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "ref_ce_loss": 0.16163264214992523, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "loss": 0.7146525979042053, + "step": 960 + }, + { + "ce_loss": 0.30217331647872925, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "distill_loss": 0.13478901982307434, + "epoch": 0.3202134756504336, + "step": 960 + }, + { + "epoch": 0.3202134756504336, + "ref_ce_loss": 0.1618402898311615, + "step": 960 + }, + { + "epoch": 0.32354903268845897, + "loss": 0.9339, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "grad_norm": 4.9782819747924805, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "learning_rate": 0.0002999145265963693, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "loss": 0.7821298241615295, + "step": 970 + }, + { + "ce_loss": 0.4171198606491089, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "distill_loss": 0.1375242918729782, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "ref_ce_loss": 0.17934392392635345, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "loss": 1.0573623180389404, + "step": 970 + }, + { + "ce_loss": 0.42946135997772217, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "distill_loss": 0.12328819185495377, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "ref_ce_loss": 0.16554884612560272, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "loss": 1.2828457355499268, + "step": 970 + }, + { + "ce_loss": 0.4472983777523041, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "distill_loss": 0.13271930813789368, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "ref_ce_loss": 0.201155424118042, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "loss": 1.5741230249404907, + "step": 970 + }, + { + "ce_loss": 0.3056272864341736, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "distill_loss": 0.10602132976055145, + "epoch": 0.32354903268845897, + "step": 970 + }, + { + "epoch": 0.32354903268845897, + "ref_ce_loss": 0.2535144090652466, + "step": 970 + }, + { + "epoch": 0.3268845897264843, + "loss": 1.0272, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "grad_norm": 6.052799701690674, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "learning_rate": 0.00029990755268314667, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "loss": 0.7132030129432678, + "step": 980 + }, + { + "ce_loss": 0.30206334590911865, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "distill_loss": 0.10779759287834167, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "ref_ce_loss": 0.18162314593791962, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "loss": 0.8841077089309692, + "step": 980 + }, + { + "ce_loss": 0.37281209230422974, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "distill_loss": 0.11574500054121017, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "ref_ce_loss": 0.2086096853017807, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "loss": 0.9464960098266602, + "step": 980 + }, + { + "ce_loss": 0.4134019613265991, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "distill_loss": 0.1166180819272995, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "ref_ce_loss": 0.24008263647556305, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "loss": 0.9239301681518555, + "step": 980 + }, + { + "ce_loss": 0.3819361627101898, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "distill_loss": 0.10396760702133179, + "epoch": 0.3268845897264843, + "step": 980 + }, + { + "epoch": 0.3268845897264843, + "ref_ce_loss": 0.23532161116600037, + "step": 980 + }, + { + "epoch": 0.33022014676450967, + "loss": 0.8154, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "grad_norm": 4.464677333831787, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "learning_rate": 0.00029990030539768167, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "loss": 1.126884937286377, + "step": 990 + }, + { + "ce_loss": 0.2962307333946228, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "distill_loss": 0.11264996230602264, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "ref_ce_loss": 0.15359817445278168, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "loss": 0.7011892795562744, + "step": 990 + }, + { + "ce_loss": 0.35758545994758606, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "distill_loss": 0.11551101505756378, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "ref_ce_loss": 0.12288648635149002, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "loss": 0.7315992116928101, + "step": 990 + }, + { + "ce_loss": 0.29825419187545776, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "distill_loss": 0.09923024475574493, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "ref_ce_loss": 0.15791910886764526, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "loss": 1.1659493446350098, + "step": 990 + }, + { + "ce_loss": 0.4181416928768158, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "distill_loss": 0.1343041956424713, + "epoch": 0.33022014676450967, + "step": 990 + }, + { + "epoch": 0.33022014676450967, + "ref_ce_loss": 0.12854261696338654, + "step": 990 + }, + { + "epoch": 0.333555703802535, + "loss": 0.9506, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "grad_norm": 8.558215141296387, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "learning_rate": 0.0002998927847531905, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "loss": 1.2239561080932617, + "step": 1000 + }, + { + "ce_loss": 0.3037545382976532, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "distill_loss": 0.6090459823608398, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "ref_ce_loss": 0.16965021193027496, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "loss": 1.0540249347686768, + "step": 1000 + }, + { + "ce_loss": 0.3109924793243408, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "distill_loss": 0.5928265452384949, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "ref_ce_loss": 0.14983440935611725, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "loss": 1.3430770635604858, + "step": 1000 + }, + { + "ce_loss": 0.37822142243385315, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "distill_loss": 0.702450156211853, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "ref_ce_loss": 0.1507684737443924, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "loss": 1.3171063661575317, + "step": 1000 + }, + { + "ce_loss": 0.3266257643699646, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "distill_loss": 0.7057124376296997, + "epoch": 0.333555703802535, + "step": 1000 + }, + { + "epoch": 0.333555703802535, + "ref_ce_loss": 0.1825946867465973, + "step": 1000 + }, + { + "epoch": 0.33689126084056037, + "loss": 1.1746, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "grad_norm": 2.476720094680786, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "learning_rate": 0.0002998849907633878, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "loss": 0.8267805576324463, + "step": 1010 + }, + { + "ce_loss": 0.2990218698978424, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "distill_loss": 0.21271488070487976, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "ref_ce_loss": 0.175185889005661, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "loss": 0.7139723896980286, + "step": 1010 + }, + { + "ce_loss": 0.3102499544620514, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "distill_loss": 0.22610852122306824, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "ref_ce_loss": 0.1776013821363449, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "loss": 0.8357424736022949, + "step": 1010 + }, + { + "ce_loss": 0.3024822175502777, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "distill_loss": 0.2356206774711609, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "ref_ce_loss": 0.2250850945711136, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "loss": 1.2665164470672607, + "step": 1010 + }, + { + "ce_loss": 0.3598532974720001, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "distill_loss": 0.24678495526313782, + "epoch": 0.33689126084056037, + "step": 1010 + }, + { + "epoch": 0.33689126084056037, + "ref_ce_loss": 0.157108873128891, + "step": 1010 + }, + { + "epoch": 0.3402268178785857, + "loss": 1.0136, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "grad_norm": 3.2634665966033936, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "learning_rate": 0.0002998769234424868, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "loss": 0.7189880609512329, + "step": 1020 + }, + { + "ce_loss": 0.3308708369731903, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "distill_loss": 0.19060233235359192, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "ref_ce_loss": 0.13148698210716248, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "loss": 0.709821879863739, + "step": 1020 + }, + { + "ce_loss": 0.33700650930404663, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "distill_loss": 0.19593364000320435, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "ref_ce_loss": 0.1766604334115982, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "loss": 0.6329488158226013, + "step": 1020 + }, + { + "ce_loss": 0.3196988105773926, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "distill_loss": 0.19230948388576508, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "ref_ce_loss": 0.1209319457411766, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "loss": 1.4003040790557861, + "step": 1020 + }, + { + "ce_loss": 0.3359232246875763, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "distill_loss": 0.22875024378299713, + "epoch": 0.3402268178785857, + "step": 1020 + }, + { + "epoch": 0.3402268178785857, + "ref_ce_loss": 0.18941046297550201, + "step": 1020 + }, + { + "epoch": 0.34356237491661107, + "loss": 0.9281, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "grad_norm": 2.158473253250122, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "learning_rate": 0.00029986858280519897, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "loss": 1.0241984128952026, + "step": 1030 + }, + { + "ce_loss": 0.4109732210636139, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "distill_loss": 0.22403854131698608, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "ref_ce_loss": 0.19617129862308502, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "loss": 0.9698517322540283, + "step": 1030 + }, + { + "ce_loss": 0.33989307284355164, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "distill_loss": 0.21469691395759583, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "ref_ce_loss": 0.1773064285516739, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "loss": 1.1452717781066895, + "step": 1030 + }, + { + "ce_loss": 0.374833881855011, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "distill_loss": 0.21497607231140137, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "ref_ce_loss": 0.20118488371372223, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "loss": 1.1008607149124146, + "step": 1030 + }, + { + "ce_loss": 0.367872416973114, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "distill_loss": 0.22034002840518951, + "epoch": 0.34356237491661107, + "step": 1030 + }, + { + "epoch": 0.34356237491661107, + "ref_ce_loss": 0.21571652591228485, + "step": 1030 + }, + { + "epoch": 0.3468979319546364, + "loss": 1.1223, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "grad_norm": 5.033119201660156, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "learning_rate": 0.0002998599688667345, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "loss": 0.5317191481590271, + "step": 1040 + }, + { + "ce_loss": 0.2717324495315552, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "distill_loss": 0.13226278126239777, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "ref_ce_loss": 0.12715670466423035, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "loss": 1.1311434507369995, + "step": 1040 + }, + { + "ce_loss": 0.5449992418289185, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "distill_loss": 0.2138533592224121, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "ref_ce_loss": 0.22759371995925903, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "loss": 1.385339617729187, + "step": 1040 + }, + { + "ce_loss": 0.36298808455467224, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "distill_loss": 0.1468626856803894, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "ref_ce_loss": 0.20583710074424744, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "loss": 1.0133357048034668, + "step": 1040 + }, + { + "ce_loss": 0.3378484845161438, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "distill_loss": 0.1804906278848648, + "epoch": 0.3468979319546364, + "step": 1040 + }, + { + "epoch": 0.3468979319546364, + "ref_ce_loss": 0.11686846613883972, + "step": 1040 + }, + { + "epoch": 0.35023348899266177, + "loss": 0.9878, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "grad_norm": 2.773841381072998, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "learning_rate": 0.0002998510816428017, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "loss": 0.8605588674545288, + "step": 1050 + }, + { + "ce_loss": 0.40844660997390747, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "distill_loss": 0.1829022318124771, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "ref_ce_loss": 0.1620185673236847, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "loss": 0.9875406622886658, + "step": 1050 + }, + { + "ce_loss": 0.5963196158409119, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "distill_loss": 0.21429228782653809, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "ref_ce_loss": 0.17668293416500092, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "loss": 0.8259153962135315, + "step": 1050 + }, + { + "ce_loss": 0.36064836382865906, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "distill_loss": 0.17781072854995728, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "ref_ce_loss": 0.18637806177139282, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "loss": 0.7744853496551514, + "step": 1050 + }, + { + "ce_loss": 0.38626617193222046, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "distill_loss": 0.18170593678951263, + "epoch": 0.35023348899266177, + "step": 1050 + }, + { + "epoch": 0.35023348899266177, + "ref_ce_loss": 0.1515505313873291, + "step": 1050 + }, + { + "epoch": 0.3535690460306871, + "loss": 1.0142, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "grad_norm": 2.391183376312256, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "learning_rate": 0.00029984192114960746, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "loss": 0.7195358276367188, + "step": 1060 + }, + { + "ce_loss": 0.3371320962905884, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "distill_loss": 0.23293828964233398, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "ref_ce_loss": 0.08528699725866318, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "loss": 1.3199244737625122, + "step": 1060 + }, + { + "ce_loss": 0.4546003043651581, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "distill_loss": 0.2744484543800354, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "ref_ce_loss": 0.15281927585601807, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "loss": 0.916218101978302, + "step": 1060 + }, + { + "ce_loss": 0.3605182468891144, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "distill_loss": 0.2062976062297821, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "ref_ce_loss": 0.19087523221969604, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "loss": 0.6971317529678345, + "step": 1060 + }, + { + "ce_loss": 0.27850502729415894, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "distill_loss": 0.20633718371391296, + "epoch": 0.3535690460306871, + "step": 1060 + }, + { + "epoch": 0.3535690460306871, + "ref_ce_loss": 0.21209193766117096, + "step": 1060 + }, + { + "epoch": 0.35690460306871247, + "loss": 0.9138, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "grad_norm": 3.1012446880340576, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "learning_rate": 0.0002998324874038568, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "loss": 1.016005277633667, + "step": 1070 + }, + { + "ce_loss": 0.3927364647388458, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "distill_loss": 0.2100256383419037, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "ref_ce_loss": 0.24908369779586792, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "loss": 0.8831821084022522, + "step": 1070 + }, + { + "ce_loss": 0.43767842650413513, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "distill_loss": 0.21271327137947083, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "ref_ce_loss": 0.23278841376304626, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "loss": 0.774807870388031, + "step": 1070 + }, + { + "ce_loss": 0.4057586193084717, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "distill_loss": 0.22247371077537537, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "ref_ce_loss": 0.14649169147014618, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "loss": 1.6678129434585571, + "step": 1070 + }, + { + "ce_loss": 0.4439554512500763, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "distill_loss": 0.22488150000572205, + "epoch": 0.35690460306871247, + "step": 1070 + }, + { + "epoch": 0.35690460306871247, + "ref_ce_loss": 0.23912842571735382, + "step": 1070 + }, + { + "epoch": 0.3602401601067378, + "loss": 1.0485, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "grad_norm": 2.263382911682129, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "learning_rate": 0.00029982278042275327, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "loss": 0.9002568125724792, + "step": 1080 + }, + { + "ce_loss": 0.2690925598144531, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "distill_loss": 0.2799534499645233, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "ref_ce_loss": 0.18060727417469025, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "loss": 0.8647176027297974, + "step": 1080 + }, + { + "ce_loss": 0.301247239112854, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "distill_loss": 0.3146838843822479, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "ref_ce_loss": 0.15903924405574799, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "loss": 0.9686211943626404, + "step": 1080 + }, + { + "ce_loss": 0.38894566893577576, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "distill_loss": 0.3025593161582947, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "ref_ce_loss": 0.18121132254600525, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "loss": 0.8153437376022339, + "step": 1080 + }, + { + "ce_loss": 0.27934902906417847, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "distill_loss": 0.3312188982963562, + "epoch": 0.3602401601067378, + "step": 1080 + }, + { + "epoch": 0.3602401601067378, + "ref_ce_loss": 0.1401250958442688, + "step": 1080 + }, + { + "epoch": 0.36357571714476317, + "loss": 1.0589, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "grad_norm": 2.614572048187256, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "learning_rate": 0.0002998128002239985, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "loss": 0.9504167437553406, + "step": 1090 + }, + { + "ce_loss": 0.40660786628723145, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "distill_loss": 0.24065075814723969, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "ref_ce_loss": 0.21729962527751923, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "loss": 0.8468886613845825, + "step": 1090 + }, + { + "ce_loss": 0.3686285614967346, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "distill_loss": 0.22304001450538635, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "ref_ce_loss": 0.16232386231422424, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "loss": 0.8567818403244019, + "step": 1090 + }, + { + "ce_loss": 0.3550497889518738, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "distill_loss": 0.24365133047103882, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "ref_ce_loss": 0.15234960615634918, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "loss": 0.9920378923416138, + "step": 1090 + }, + { + "ce_loss": 0.40550684928894043, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "distill_loss": 0.23939445614814758, + "epoch": 0.36357571714476317, + "step": 1090 + }, + { + "epoch": 0.36357571714476317, + "ref_ce_loss": 0.25587886571884155, + "step": 1090 + }, + { + "epoch": 0.3669112741827885, + "loss": 0.9507, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "grad_norm": 1.8720557689666748, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "learning_rate": 0.00029980254682579244, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "loss": 0.6526878476142883, + "step": 1100 + }, + { + "ce_loss": 0.3227846920490265, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "distill_loss": 0.11304277926683426, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "ref_ce_loss": 0.21555647253990173, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "loss": 0.9330704212188721, + "step": 1100 + }, + { + "ce_loss": 0.42695215344429016, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "distill_loss": 0.13858823478221893, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "ref_ce_loss": 0.18709157407283783, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "loss": 0.6232830882072449, + "step": 1100 + }, + { + "ce_loss": 0.25201037526130676, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "distill_loss": 0.10899011045694351, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "ref_ce_loss": 0.16215498745441437, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "loss": 0.6160975098609924, + "step": 1100 + }, + { + "ce_loss": 0.24335621297359467, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "distill_loss": 0.1022597998380661, + "epoch": 0.3669112741827885, + "step": 1100 + }, + { + "epoch": 0.3669112741827885, + "ref_ce_loss": 0.12327880412340164, + "step": 1100 + }, + { + "epoch": 0.37024683122081387, + "loss": 0.8546, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "grad_norm": 2.0390474796295166, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "learning_rate": 0.00029979202024683324, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "loss": 0.7502092123031616, + "step": 1110 + }, + { + "ce_loss": 0.3670903742313385, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "distill_loss": 0.11122335493564606, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "ref_ce_loss": 0.18666565418243408, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "loss": 0.7180501222610474, + "step": 1110 + }, + { + "ce_loss": 0.38356834650039673, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "distill_loss": 0.124627023935318, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "ref_ce_loss": 0.2097601294517517, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "loss": 1.266775131225586, + "step": 1110 + }, + { + "ce_loss": 0.36160174012184143, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "distill_loss": 0.11643195152282715, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "ref_ce_loss": 0.15776577591896057, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "loss": 0.6577326059341431, + "step": 1110 + }, + { + "ce_loss": 0.3060700297355652, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "distill_loss": 0.11561097204685211, + "epoch": 0.37024683122081387, + "step": 1110 + }, + { + "epoch": 0.37024683122081387, + "ref_ce_loss": 0.1511240005493164, + "step": 1110 + }, + { + "epoch": 0.3735823882588392, + "loss": 1.115, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "grad_norm": 2.075615167617798, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "learning_rate": 0.00029978122050631725, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "loss": 0.7357965111732483, + "step": 1120 + }, + { + "ce_loss": 0.33645859360694885, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "distill_loss": 0.10484284907579422, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "ref_ce_loss": 0.14064335823059082, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "loss": 0.5798081755638123, + "step": 1120 + }, + { + "ce_loss": 0.2527194023132324, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "distill_loss": 0.08029267936944962, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "ref_ce_loss": 0.17915977537631989, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "loss": 0.9990452527999878, + "step": 1120 + }, + { + "ce_loss": 0.391522616147995, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "distill_loss": 0.10954640805721283, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "ref_ce_loss": 0.19175396859645844, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "loss": 0.7248374819755554, + "step": 1120 + }, + { + "ce_loss": 0.41094255447387695, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "distill_loss": 0.11109843105077744, + "epoch": 0.3735823882588392, + "step": 1120 + }, + { + "epoch": 0.3735823882588392, + "ref_ce_loss": 0.2024851143360138, + "step": 1120 + }, + { + "epoch": 0.37691794529686456, + "loss": 0.8209, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "grad_norm": 1.610378623008728, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "learning_rate": 0.00029977014762393894, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "loss": 1.7917901277542114, + "step": 1130 + }, + { + "ce_loss": 0.41128435730934143, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "distill_loss": 0.10002844035625458, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "ref_ce_loss": 0.19286653399467468, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "loss": 0.7495785355567932, + "step": 1130 + }, + { + "ce_loss": 0.3824373483657837, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "distill_loss": 0.09886406362056732, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "ref_ce_loss": 0.18395927548408508, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "loss": 0.9176914691925049, + "step": 1130 + }, + { + "ce_loss": 0.3707192540168762, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "distill_loss": 0.08182831108570099, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "ref_ce_loss": 0.18617984652519226, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "loss": 1.0768955945968628, + "step": 1130 + }, + { + "ce_loss": 0.32718828320503235, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "distill_loss": 0.09349283576011658, + "epoch": 0.37691794529686456, + "step": 1130 + }, + { + "epoch": 0.37691794529686456, + "ref_ce_loss": 0.12686288356781006, + "step": 1130 + }, + { + "epoch": 0.3802535023348899, + "loss": 0.9497, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "grad_norm": 3.2042505741119385, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "learning_rate": 0.0002997588016198908, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "loss": 0.9679973125457764, + "step": 1140 + }, + { + "ce_loss": 0.4296916723251343, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "distill_loss": 0.10788857191801071, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "ref_ce_loss": 0.250051885843277, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "loss": 0.7972154021263123, + "step": 1140 + }, + { + "ce_loss": 0.41705816984176636, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "distill_loss": 0.12142591923475266, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "ref_ce_loss": 0.18019571900367737, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "loss": 0.8467376232147217, + "step": 1140 + }, + { + "ce_loss": 0.36607176065444946, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "distill_loss": 0.10272481292486191, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "ref_ce_loss": 0.1918632835149765, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "loss": 0.722500205039978, + "step": 1140 + }, + { + "ce_loss": 0.3441055715084076, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "distill_loss": 0.12058152258396149, + "epoch": 0.3802535023348899, + "step": 1140 + }, + { + "epoch": 0.3802535023348899, + "ref_ce_loss": 0.1723051369190216, + "step": 1140 + }, + { + "epoch": 0.38358905937291526, + "loss": 0.8633, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "grad_norm": 1.7969938516616821, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "learning_rate": 0.00029974718251486363, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "loss": 1.4030652046203613, + "step": 1150 + }, + { + "ce_loss": 0.33470800518989563, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "distill_loss": 0.0937473401427269, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "ref_ce_loss": 0.15464939177036285, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "loss": 0.7301994562149048, + "step": 1150 + }, + { + "ce_loss": 0.3315107822418213, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "distill_loss": 0.09801779687404633, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "ref_ce_loss": 0.17848145961761475, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "loss": 0.8190576434135437, + "step": 1150 + }, + { + "ce_loss": 0.3555756211280823, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "distill_loss": 0.09977540373802185, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "ref_ce_loss": 0.1742119938135147, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "loss": 0.6100097298622131, + "step": 1150 + }, + { + "ce_loss": 0.27171769738197327, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "distill_loss": 0.09016477316617966, + "epoch": 0.38358905937291526, + "step": 1150 + }, + { + "epoch": 0.38358905937291526, + "ref_ce_loss": 0.1305205374956131, + "step": 1150 + }, + { + "epoch": 0.3869246164109406, + "loss": 0.9321, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "grad_norm": 2.3243284225463867, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "learning_rate": 0.0002997352903300459, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "loss": 0.8214346170425415, + "step": 1160 + }, + { + "ce_loss": 0.349997878074646, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "distill_loss": 0.08020985126495361, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "ref_ce_loss": 0.14636772871017456, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "loss": 0.7112948894500732, + "step": 1160 + }, + { + "ce_loss": 0.4055274426937103, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "distill_loss": 0.07691830396652222, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "ref_ce_loss": 0.22505919635295868, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "loss": 1.0252524614334106, + "step": 1160 + }, + { + "ce_loss": 0.4234050512313843, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "distill_loss": 0.10572229325771332, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "ref_ce_loss": 0.19568488001823425, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "loss": 1.1514942646026611, + "step": 1160 + }, + { + "ce_loss": 0.38408973813056946, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "distill_loss": 0.09920583665370941, + "epoch": 0.3869246164109406, + "step": 1160 + }, + { + "epoch": 0.3869246164109406, + "ref_ce_loss": 0.1944963037967682, + "step": 1160 + }, + { + "epoch": 0.39026017344896596, + "loss": 0.9385, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "grad_norm": 2.040172576904297, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "learning_rate": 0.0002997231250871244, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "loss": 1.2399312257766724, + "step": 1170 + }, + { + "ce_loss": 0.3100973069667816, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "distill_loss": 0.11440251022577286, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "ref_ce_loss": 0.14282414317131042, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "loss": 0.8066614866256714, + "step": 1170 + }, + { + "ce_loss": 0.29232895374298096, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "distill_loss": 0.09082722663879395, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "ref_ce_loss": 0.15069840848445892, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "loss": 0.6730996370315552, + "step": 1170 + }, + { + "ce_loss": 0.30601993203163147, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "distill_loss": 0.11863667517900467, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "ref_ce_loss": 0.14708290994167328, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "loss": 2.0505471229553223, + "step": 1170 + }, + { + "ce_loss": 0.3397879898548126, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "distill_loss": 0.11590779572725296, + "epoch": 0.39026017344896596, + "step": 1170 + }, + { + "epoch": 0.39026017344896596, + "ref_ce_loss": 0.18287134170532227, + "step": 1170 + }, + { + "epoch": 0.3935957304869913, + "loss": 0.9995, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "grad_norm": 3.7084498405456543, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "learning_rate": 0.0002997106868082837, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "loss": 0.9556168913841248, + "step": 1180 + }, + { + "ce_loss": 0.3851969540119171, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "distill_loss": 0.22958514094352722, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "ref_ce_loss": 0.20239123702049255, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "loss": 0.8375714421272278, + "step": 1180 + }, + { + "ce_loss": 0.30635103583335876, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "distill_loss": 0.19275817275047302, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "ref_ce_loss": 0.21164017915725708, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "loss": 0.780803918838501, + "step": 1180 + }, + { + "ce_loss": 0.36288484930992126, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "distill_loss": 0.2200234830379486, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "ref_ce_loss": 0.19749197363853455, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "loss": 0.9087613821029663, + "step": 1180 + }, + { + "ce_loss": 0.2523532211780548, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "distill_loss": 0.1913367360830307, + "epoch": 0.3935957304869913, + "step": 1180 + }, + { + "epoch": 0.3935957304869913, + "ref_ce_loss": 0.1895836889743805, + "step": 1180 + }, + { + "epoch": 0.39693128752501666, + "loss": 0.9122, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "grad_norm": 2.2110822200775146, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "learning_rate": 0.0002996979755162063, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "loss": 1.271198034286499, + "step": 1190 + }, + { + "ce_loss": 0.2786172032356262, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "distill_loss": 0.15106816589832306, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "ref_ce_loss": 0.13261818885803223, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "loss": 0.7029586434364319, + "step": 1190 + }, + { + "ce_loss": 0.32034623622894287, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "distill_loss": 0.1312541663646698, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "ref_ce_loss": 0.15258146822452545, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "loss": 0.7901096940040588, + "step": 1190 + }, + { + "ce_loss": 0.37307074666023254, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "distill_loss": 0.13851478695869446, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "ref_ce_loss": 0.16212794184684753, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "loss": 1.0396982431411743, + "step": 1190 + }, + { + "ce_loss": 0.4234752357006073, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "distill_loss": 0.15358714759349823, + "epoch": 0.39693128752501666, + "step": 1190 + }, + { + "epoch": 0.39693128752501666, + "ref_ce_loss": 0.22049111127853394, + "step": 1190 + }, + { + "epoch": 0.400266844563042, + "loss": 0.8564, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "grad_norm": 2.3212337493896484, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "learning_rate": 0.00029968499123407267, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "loss": 0.5984683632850647, + "step": 1200 + }, + { + "ce_loss": 0.3177834451198578, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "distill_loss": 0.09032616019248962, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "ref_ce_loss": 0.12230977416038513, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "loss": 1.0842852592468262, + "step": 1200 + }, + { + "ce_loss": 0.28705641627311707, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "distill_loss": 0.08843199163675308, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "ref_ce_loss": 0.15939076244831085, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "loss": 0.961142897605896, + "step": 1200 + }, + { + "ce_loss": 0.33942434191703796, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "distill_loss": 0.0857396274805069, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "ref_ce_loss": 0.23185887932777405, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "loss": 0.9665995240211487, + "step": 1200 + }, + { + "ce_loss": 0.27184587717056274, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "distill_loss": 0.0773671567440033, + "epoch": 0.400266844563042, + "step": 1200 + }, + { + "epoch": 0.400266844563042, + "ref_ce_loss": 0.22190047800540924, + "step": 1200 + }, + { + "epoch": 0.40360240160106736, + "loss": 0.8723, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "grad_norm": 2.192169427871704, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "learning_rate": 0.00029967173398556086, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "loss": 0.7515622973442078, + "step": 1210 + }, + { + "ce_loss": 0.3157145380973816, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "distill_loss": 0.094798743724823, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "ref_ce_loss": 0.23686806857585907, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "loss": 0.7274072170257568, + "step": 1210 + }, + { + "ce_loss": 0.3264521658420563, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "distill_loss": 0.09107182919979095, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "ref_ce_loss": 0.1764145791530609, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "loss": 1.4342138767242432, + "step": 1210 + }, + { + "ce_loss": 0.3195574879646301, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "distill_loss": 0.07628357410430908, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "ref_ce_loss": 0.15575119853019714, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "loss": 0.5367107391357422, + "step": 1210 + }, + { + "ce_loss": 0.2918946444988251, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "distill_loss": 0.08166878670454025, + "epoch": 0.40360240160106736, + "step": 1210 + }, + { + "epoch": 0.40360240160106736, + "ref_ce_loss": 0.16308514773845673, + "step": 1210 + }, + { + "epoch": 0.4069379586390927, + "loss": 0.9943, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "grad_norm": 3.139317512512207, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "learning_rate": 0.00029965820379484695, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "loss": 1.1364343166351318, + "step": 1220 + }, + { + "ce_loss": 0.37319913506507874, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "distill_loss": 0.09696052223443985, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "ref_ce_loss": 0.18106557428836823, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "loss": 0.905819833278656, + "step": 1220 + }, + { + "ce_loss": 0.4253186881542206, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "distill_loss": 0.08640832453966141, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "ref_ce_loss": 0.2814835011959076, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "loss": 0.6299371123313904, + "step": 1220 + }, + { + "ce_loss": 0.3122254014015198, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "distill_loss": 0.08051759749650955, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "ref_ce_loss": 0.23709669709205627, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "loss": 0.8787524700164795, + "step": 1220 + }, + { + "ce_loss": 0.30214032530784607, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "distill_loss": 0.08182747662067413, + "epoch": 0.4069379586390927, + "step": 1220 + }, + { + "epoch": 0.4069379586390927, + "ref_ce_loss": 0.1771172732114792, + "step": 1220 + }, + { + "epoch": 0.41027351567711806, + "loss": 0.8505, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "grad_norm": 2.6399848461151123, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "learning_rate": 0.00029964440068660467, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "loss": 0.881452739238739, + "step": 1230 + }, + { + "ce_loss": 0.4639228284358978, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "distill_loss": 0.16353951394557953, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "ref_ce_loss": 0.18111327290534973, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "loss": 1.0701333284378052, + "step": 1230 + }, + { + "ce_loss": 0.5111604928970337, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "distill_loss": 0.14016252756118774, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "ref_ce_loss": 0.2831771671772003, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "loss": 0.7755110263824463, + "step": 1230 + }, + { + "ce_loss": 0.3701687753200531, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "distill_loss": 0.13138516247272491, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "ref_ce_loss": 0.2043834626674652, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "loss": 0.90342116355896, + "step": 1230 + }, + { + "ce_loss": 0.24989053606987, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "distill_loss": 0.14895430207252502, + "epoch": 0.41027351567711806, + "step": 1230 + }, + { + "epoch": 0.41027351567711806, + "ref_ce_loss": 0.1435110718011856, + "step": 1230 + }, + { + "epoch": 0.4136090727151434, + "loss": 1.8225, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "grad_norm": 5.425915718078613, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "learning_rate": 0.0002996303246860054, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "loss": 1.3830713033676147, + "step": 1240 + }, + { + "ce_loss": 0.3072783350944519, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "distill_loss": 0.7292137145996094, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "ref_ce_loss": 0.1486404836177826, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "loss": 1.6045691967010498, + "step": 1240 + }, + { + "ce_loss": 0.4130804240703583, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "distill_loss": 0.8139525651931763, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "ref_ce_loss": 0.19312764704227448, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "loss": 2.623363733291626, + "step": 1240 + }, + { + "ce_loss": 0.3971395194530487, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "distill_loss": 0.8659202456474304, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "ref_ce_loss": 0.20216262340545654, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "loss": 1.5429506301879883, + "step": 1240 + }, + { + "ce_loss": 0.2974897623062134, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "distill_loss": 0.8454087376594543, + "epoch": 0.4136090727151434, + "step": 1240 + }, + { + "epoch": 0.4136090727151434, + "ref_ce_loss": 0.15195105969905853, + "step": 1240 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.2865, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "grad_norm": 2.501133918762207, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "learning_rate": 0.0002996159758187183, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.0447216033935547, + "step": 1250 + }, + { + "ce_loss": 0.4286389946937561, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "distill_loss": 0.3530900478363037, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "ref_ce_loss": 0.17672403156757355, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.8678219318389893, + "step": 1250 + }, + { + "ce_loss": 0.3537362217903137, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "distill_loss": 0.32580870389938354, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "ref_ce_loss": 0.23289765417575836, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.0801249742507935, + "step": 1250 + }, + { + "ce_loss": 0.3455648422241211, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "distill_loss": 0.33082038164138794, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "ref_ce_loss": 0.21357986330986023, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "loss": 1.061863899230957, + "step": 1250 + }, + { + "ce_loss": 0.3110015094280243, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "distill_loss": 0.29846805334091187, + "epoch": 0.41694462975316876, + "step": 1250 + }, + { + "epoch": 0.41694462975316876, + "ref_ce_loss": 0.20160391926765442, + "step": 1250 + }, + { + "epoch": 0.4202801867911941, + "loss": 1.0603, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "grad_norm": 4.449580669403076, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "learning_rate": 0.00029960135411090995, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "loss": 0.6296399831771851, + "step": 1260 + }, + { + "ce_loss": 0.2865029573440552, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "distill_loss": 0.1852218210697174, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "ref_ce_loss": 0.09856808930635452, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "loss": 0.9737272262573242, + "step": 1260 + }, + { + "ce_loss": 0.33188915252685547, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "distill_loss": 0.19780108332633972, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "ref_ce_loss": 0.1794106811285019, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "loss": 0.934063732624054, + "step": 1260 + }, + { + "ce_loss": 0.44743072986602783, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "distill_loss": 0.231248140335083, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "ref_ce_loss": 0.25490811467170715, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "loss": 0.9184874892234802, + "step": 1260 + }, + { + "ce_loss": 0.36027130484580994, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "distill_loss": 0.24745814502239227, + "epoch": 0.4202801867911941, + "step": 1260 + }, + { + "epoch": 0.4202801867911941, + "ref_ce_loss": 0.2323707938194275, + "step": 1260 + }, + { + "epoch": 0.42361574382921946, + "loss": 1.0, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "grad_norm": 4.113201141357422, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "learning_rate": 0.00029958645958924466, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "loss": 0.6826831102371216, + "step": 1270 + }, + { + "ce_loss": 0.3329589068889618, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "distill_loss": 0.19312255084514618, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "ref_ce_loss": 0.15657509863376617, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "loss": 0.7136074304580688, + "step": 1270 + }, + { + "ce_loss": 0.28988170623779297, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "distill_loss": 0.16940924525260925, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "ref_ce_loss": 0.1954113394021988, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "loss": 0.6761580109596252, + "step": 1270 + }, + { + "ce_loss": 0.33831357955932617, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "distill_loss": 0.18440185487270355, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "ref_ce_loss": 0.15340223908424377, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "loss": 0.7417470216751099, + "step": 1270 + }, + { + "ce_loss": 0.31187084317207336, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "distill_loss": 0.2230009287595749, + "epoch": 0.42361574382921946, + "step": 1270 + }, + { + "epoch": 0.42361574382921946, + "ref_ce_loss": 0.11719028651714325, + "step": 1270 + }, + { + "epoch": 0.4269513008672448, + "loss": 0.9325, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "grad_norm": 2.9946320056915283, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "learning_rate": 0.0002995712922808841, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "loss": 1.672420859336853, + "step": 1280 + }, + { + "ce_loss": 0.3916040062904358, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "distill_loss": 0.21645702421665192, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "ref_ce_loss": 0.22933925688266754, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "loss": 0.93517005443573, + "step": 1280 + }, + { + "ce_loss": 0.40770724415779114, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "distill_loss": 0.25952303409576416, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "ref_ce_loss": 0.22096198797225952, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "loss": 0.9371185302734375, + "step": 1280 + }, + { + "ce_loss": 0.36753150820732117, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "distill_loss": 0.33493292331695557, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "ref_ce_loss": 0.15782517194747925, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "loss": 0.7511712312698364, + "step": 1280 + }, + { + "ce_loss": 0.25156018137931824, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "distill_loss": 0.26513832807540894, + "epoch": 0.4269513008672448, + "step": 1280 + }, + { + "epoch": 0.4269513008672448, + "ref_ce_loss": 0.140127494931221, + "step": 1280 + }, + { + "epoch": 0.43028685790527016, + "loss": 0.8839, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "grad_norm": 5.134028434753418, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "learning_rate": 0.0002995558522134875, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "loss": 0.7763099670410156, + "step": 1290 + }, + { + "ce_loss": 0.3329162895679474, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "distill_loss": 0.2020120471715927, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "ref_ce_loss": 0.15234671533107758, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "loss": 1.1825041770935059, + "step": 1290 + }, + { + "ce_loss": 0.346752792596817, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "distill_loss": 0.21200241148471832, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "ref_ce_loss": 0.16237637400627136, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "loss": 0.6205200552940369, + "step": 1290 + }, + { + "ce_loss": 0.280308336019516, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "distill_loss": 0.20304308831691742, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "ref_ce_loss": 0.1370389461517334, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "loss": 0.9143213033676147, + "step": 1290 + }, + { + "ce_loss": 0.38608551025390625, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "distill_loss": 0.18370597064495087, + "epoch": 0.43028685790527016, + "step": 1290 + }, + { + "epoch": 0.43028685790527016, + "ref_ce_loss": 0.14932240545749664, + "step": 1290 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.2536, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "grad_norm": 83.33971405029297, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "learning_rate": 0.0002995401394152114, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.5523114204406738, + "step": 1300 + }, + { + "ce_loss": 0.39939671754837036, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "distill_loss": 0.5877339243888855, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "ref_ce_loss": 0.16907824575901031, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.2784559726715088, + "step": 1300 + }, + { + "ce_loss": 0.32869669795036316, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "distill_loss": 0.5456231832504272, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "ref_ce_loss": 0.16758182644844055, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.1197588443756104, + "step": 1300 + }, + { + "ce_loss": 0.3304974436759949, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "distill_loss": 0.5112226605415344, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "ref_ce_loss": 0.16701827943325043, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "loss": 1.211663842201233, + "step": 1300 + }, + { + "ce_loss": 0.322611004114151, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "distill_loss": 0.6028107404708862, + "epoch": 0.4336224149432955, + "step": 1300 + }, + { + "epoch": 0.4336224149432955, + "ref_ce_loss": 0.2120797336101532, + "step": 1300 + }, + { + "epoch": 0.43695797198132086, + "loss": 1.0281, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "grad_norm": 2.6226251125335693, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "learning_rate": 0.00029952415391470977, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "loss": 1.3600938320159912, + "step": 1310 + }, + { + "ce_loss": 0.42805051803588867, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "distill_loss": 0.21402305364608765, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "ref_ce_loss": 0.16621027886867523, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "loss": 0.971459686756134, + "step": 1310 + }, + { + "ce_loss": 0.32620683312416077, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "distill_loss": 0.18102744221687317, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "ref_ce_loss": 0.19330710172653198, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "loss": 1.3167970180511475, + "step": 1310 + }, + { + "ce_loss": 0.4114319980144501, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "distill_loss": 0.22762969136238098, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "ref_ce_loss": 0.16943278908729553, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "loss": 0.8594940900802612, + "step": 1310 + }, + { + "ce_loss": 0.36697542667388916, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "distill_loss": 0.2220364212989807, + "epoch": 0.43695797198132086, + "step": 1310 + }, + { + "epoch": 0.43695797198132086, + "ref_ce_loss": 0.15329211950302124, + "step": 1310 + }, + { + "epoch": 0.4402935290193462, + "loss": 0.8734, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "grad_norm": 2.488173723220825, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "learning_rate": 0.0002995078957411339, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "loss": 0.8020517230033875, + "step": 1320 + }, + { + "ce_loss": 0.3077715039253235, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "distill_loss": 0.13831500709056854, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "ref_ce_loss": 0.18166959285736084, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "loss": 1.254194736480713, + "step": 1320 + }, + { + "ce_loss": 0.3027229309082031, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "distill_loss": 0.1362319141626358, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "ref_ce_loss": 0.20650614798069, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "loss": 0.5869837999343872, + "step": 1320 + }, + { + "ce_loss": 0.2980230748653412, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "distill_loss": 0.1261177659034729, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "ref_ce_loss": 0.1628243774175644, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "loss": 0.789718508720398, + "step": 1320 + }, + { + "ce_loss": 0.40123918652534485, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "distill_loss": 0.15148304402828217, + "epoch": 0.4402935290193462, + "step": 1320 + }, + { + "epoch": 0.4402935290193462, + "ref_ce_loss": 0.16462190449237823, + "step": 1320 + }, + { + "epoch": 0.44362908605737156, + "loss": 0.7744, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "grad_norm": 1.8618627786636353, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "learning_rate": 0.00029949136492413224, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "loss": 0.5880228281021118, + "step": 1330 + }, + { + "ce_loss": 0.27567267417907715, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "distill_loss": 0.09483489394187927, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "ref_ce_loss": 0.18227918446063995, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "loss": 0.8252865076065063, + "step": 1330 + }, + { + "ce_loss": 0.2724820077419281, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "distill_loss": 0.0967143326997757, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "ref_ce_loss": 0.18324853479862213, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "loss": 0.7001053094863892, + "step": 1330 + }, + { + "ce_loss": 0.2155034989118576, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "distill_loss": 0.11413068324327469, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "ref_ce_loss": 0.1161014661192894, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "loss": 0.8354309797286987, + "step": 1330 + }, + { + "ce_loss": 0.36032605171203613, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "distill_loss": 0.09894613176584244, + "epoch": 0.44362908605737156, + "step": 1330 + }, + { + "epoch": 0.44362908605737156, + "ref_ce_loss": 0.22084367275238037, + "step": 1330 + }, + { + "epoch": 0.4469646430953969, + "loss": 0.9264, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "grad_norm": 3.5847744941711426, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "learning_rate": 0.0002994745614938505, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "loss": 2.41884708404541, + "step": 1340 + }, + { + "ce_loss": 0.35969698429107666, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "distill_loss": 0.13992217183113098, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "ref_ce_loss": 0.1331671178340912, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "loss": 0.6663955450057983, + "step": 1340 + }, + { + "ce_loss": 0.26573446393013, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "distill_loss": 0.12028539180755615, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "ref_ce_loss": 0.14755044877529144, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "loss": 0.7013957500457764, + "step": 1340 + }, + { + "ce_loss": 0.3335946798324585, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "distill_loss": 0.10393598675727844, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "ref_ce_loss": 0.211795374751091, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "loss": 0.5281260013580322, + "step": 1340 + }, + { + "ce_loss": 0.23023775219917297, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "distill_loss": 0.10416372120380402, + "epoch": 0.4469646430953969, + "step": 1340 + }, + { + "epoch": 0.4469646430953969, + "ref_ce_loss": 0.10854792594909668, + "step": 1340 + }, + { + "epoch": 0.45030020013342226, + "loss": 0.8841, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "grad_norm": 2.1285672187805176, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "learning_rate": 0.0002994574854809315, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "loss": 0.52470463514328, + "step": 1350 + }, + { + "ce_loss": 0.2516459822654724, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "distill_loss": 0.14257560670375824, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "ref_ce_loss": 0.07917644828557968, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "loss": 0.7037546634674072, + "step": 1350 + }, + { + "ce_loss": 0.31851497292518616, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "distill_loss": 0.12531189620494843, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "ref_ce_loss": 0.18432098627090454, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "loss": 0.5914328098297119, + "step": 1350 + }, + { + "ce_loss": 0.27234095335006714, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "distill_loss": 0.11418743431568146, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "ref_ce_loss": 0.20483791828155518, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "loss": 0.6631147265434265, + "step": 1350 + }, + { + "ce_loss": 0.2455311268568039, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "distill_loss": 0.13238249719142914, + "epoch": 0.45030020013342226, + "step": 1350 + }, + { + "epoch": 0.45030020013342226, + "ref_ce_loss": 0.18045808374881744, + "step": 1350 + }, + { + "epoch": 0.4536357571714476, + "loss": 0.7771, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "grad_norm": 2.4119577407836914, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "learning_rate": 0.0002994401369165151, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "loss": 0.6945372819900513, + "step": 1360 + }, + { + "ce_loss": 0.32869598269462585, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "distill_loss": 0.1271277368068695, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "ref_ce_loss": 0.23853451013565063, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "loss": 0.8358821868896484, + "step": 1360 + }, + { + "ce_loss": 0.26175400614738464, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "distill_loss": 0.1243264302611351, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "ref_ce_loss": 0.2343246340751648, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "loss": 0.7749500274658203, + "step": 1360 + }, + { + "ce_loss": 0.330771267414093, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "distill_loss": 0.14768579602241516, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "ref_ce_loss": 0.15305696427822113, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "loss": 0.9154521226882935, + "step": 1360 + }, + { + "ce_loss": 0.3262263238430023, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "distill_loss": 0.1562902182340622, + "epoch": 0.4536357571714476, + "step": 1360 + }, + { + "epoch": 0.4536357571714476, + "ref_ce_loss": 0.158981591463089, + "step": 1360 + }, + { + "epoch": 0.45697131420947296, + "loss": 0.8381, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "grad_norm": 2.2991511821746826, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "learning_rate": 0.00029942251583223834, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "loss": 0.802588939666748, + "step": 1370 + }, + { + "ce_loss": 0.3817768692970276, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "distill_loss": 0.15474967658519745, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "ref_ce_loss": 0.21179994940757751, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "loss": 0.5642900466918945, + "step": 1370 + }, + { + "ce_loss": 0.22733645141124725, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "distill_loss": 0.1353457272052765, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "ref_ce_loss": 0.12604719400405884, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "loss": 0.9796323776245117, + "step": 1370 + }, + { + "ce_loss": 0.3570672273635864, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "distill_loss": 0.1766320914030075, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "ref_ce_loss": 0.12003152817487717, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "loss": 0.8954633474349976, + "step": 1370 + }, + { + "ce_loss": 0.34348300099372864, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "distill_loss": 0.17705875635147095, + "epoch": 0.45697131420947296, + "step": 1370 + }, + { + "epoch": 0.45697131420947296, + "ref_ce_loss": 0.26571720838546753, + "step": 1370 + }, + { + "epoch": 0.4603068712474983, + "loss": 0.8116, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "grad_norm": 2.4742720127105713, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "learning_rate": 0.00029940462226023506, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "loss": 0.640617311000824, + "step": 1380 + }, + { + "ce_loss": 0.24786534905433655, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "distill_loss": 0.15409091114997864, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "ref_ce_loss": 0.12548106908798218, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "loss": 1.1944732666015625, + "step": 1380 + }, + { + "ce_loss": 0.33839792013168335, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "distill_loss": 0.14373010396957397, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "ref_ce_loss": 0.20675034821033478, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "loss": 0.8365032076835632, + "step": 1380 + }, + { + "ce_loss": 0.34791073203086853, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "distill_loss": 0.16508665680885315, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "ref_ce_loss": 0.2095586210489273, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "loss": 0.9003428816795349, + "step": 1380 + }, + { + "ce_loss": 0.3127005398273468, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "distill_loss": 0.15318584442138672, + "epoch": 0.4603068712474983, + "step": 1380 + }, + { + "epoch": 0.4603068712474983, + "ref_ce_loss": 0.15895813703536987, + "step": 1380 + }, + { + "epoch": 0.46364242828552366, + "loss": 0.8893, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "grad_norm": 2.6648497581481934, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "learning_rate": 0.0002993864562331361, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "loss": 0.525078535079956, + "step": 1390 + }, + { + "ce_loss": 0.25243738293647766, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "distill_loss": 0.1383885145187378, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "ref_ce_loss": 0.13377568125724792, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "loss": 1.248044729232788, + "step": 1390 + }, + { + "ce_loss": 0.4142797887325287, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "distill_loss": 0.21165764331817627, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "ref_ce_loss": 0.22711358964443207, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "loss": 1.7012591361999512, + "step": 1390 + }, + { + "ce_loss": 0.25872302055358887, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "distill_loss": 0.18654128909111023, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "ref_ce_loss": 0.1603059321641922, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "loss": 1.113049030303955, + "step": 1390 + }, + { + "ce_loss": 0.33455508947372437, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "distill_loss": 0.14230862259864807, + "epoch": 0.46364242828552366, + "step": 1390 + }, + { + "epoch": 0.46364242828552366, + "ref_ce_loss": 0.14293868839740753, + "step": 1390 + }, + { + "epoch": 0.466977985323549, + "loss": 0.9262, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "grad_norm": 4.262118339538574, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "learning_rate": 0.0002993680177840691, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "loss": 0.8223698139190674, + "step": 1400 + }, + { + "ce_loss": 0.3486742377281189, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "distill_loss": 0.16176161170005798, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "ref_ce_loss": 0.18248051404953003, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "loss": 0.9195911884307861, + "step": 1400 + }, + { + "ce_loss": 0.34413450956344604, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "distill_loss": 0.14552196860313416, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "ref_ce_loss": 0.22156870365142822, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "loss": 0.8011733293533325, + "step": 1400 + }, + { + "ce_loss": 0.26975682377815247, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "distill_loss": 0.14448806643486023, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "ref_ce_loss": 0.1277724802494049, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "loss": 0.7647350430488586, + "step": 1400 + }, + { + "ce_loss": 0.3322110176086426, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "distill_loss": 0.15847086906433105, + "epoch": 0.466977985323549, + "step": 1400 + }, + { + "epoch": 0.466977985323549, + "ref_ce_loss": 0.18618299067020416, + "step": 1400 + }, + { + "epoch": 0.4703135423615744, + "loss": 0.8987, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "grad_norm": 4.312910079956055, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "learning_rate": 0.00029934930694665854, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "loss": 0.7264111638069153, + "step": 1410 + }, + { + "ce_loss": 0.33520573377609253, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "distill_loss": 0.2195453941822052, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "ref_ce_loss": 0.17120620608329773, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "loss": 0.6752222180366516, + "step": 1410 + }, + { + "ce_loss": 0.3040088713169098, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "distill_loss": 0.15849845111370087, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "ref_ce_loss": 0.21257616579532623, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "loss": 0.7416223287582397, + "step": 1410 + }, + { + "ce_loss": 0.32344698905944824, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "distill_loss": 0.20644345879554749, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "ref_ce_loss": 0.14413928985595703, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "loss": 1.558759331703186, + "step": 1410 + }, + { + "ce_loss": 0.34111708402633667, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "distill_loss": 0.1806751787662506, + "epoch": 0.4703135423615744, + "step": 1410 + }, + { + "epoch": 0.4703135423615744, + "ref_ce_loss": 0.2201414853334427, + "step": 1410 + }, + { + "epoch": 0.47364909939959976, + "loss": 0.8553, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "grad_norm": 3.414496898651123, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "learning_rate": 0.0002993303237550256, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "loss": 1.673762559890747, + "step": 1420 + }, + { + "ce_loss": 0.3085290789604187, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "distill_loss": 0.16244950890541077, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "ref_ce_loss": 0.24353231489658356, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "loss": 0.9255890846252441, + "step": 1420 + }, + { + "ce_loss": 0.40204712748527527, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "distill_loss": 0.15355859696865082, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "ref_ce_loss": 0.13155770301818848, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "loss": 0.9814118146896362, + "step": 1420 + }, + { + "ce_loss": 0.4489750266075134, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "distill_loss": 0.17946653068065643, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "ref_ce_loss": 0.2264609932899475, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "loss": 1.0083775520324707, + "step": 1420 + }, + { + "ce_loss": 0.3463815450668335, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "distill_loss": 0.16612856090068817, + "epoch": 0.47364909939959976, + "step": 1420 + }, + { + "epoch": 0.47364909939959976, + "ref_ce_loss": 0.2110508233308792, + "step": 1420 + }, + { + "epoch": 0.4769846564376251, + "loss": 0.9051, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "grad_norm": 2.172429084777832, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "learning_rate": 0.00029931106824378814, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "loss": 0.6023657321929932, + "step": 1430 + }, + { + "ce_loss": 0.33847174048423767, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "distill_loss": 0.125957190990448, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "ref_ce_loss": 0.136063352227211, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "loss": 0.745324969291687, + "step": 1430 + }, + { + "ce_loss": 0.3038365840911865, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "distill_loss": 0.11901705712080002, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "ref_ce_loss": 0.16039136052131653, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "loss": 0.685943067073822, + "step": 1430 + }, + { + "ce_loss": 0.2750985026359558, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "distill_loss": 0.11924824863672256, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "ref_ce_loss": 0.18697425723075867, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "loss": 1.4483582973480225, + "step": 1430 + }, + { + "ce_loss": 0.3882851302623749, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "distill_loss": 0.12825100123882294, + "epoch": 0.4769846564376251, + "step": 1430 + }, + { + "epoch": 0.4769846564376251, + "ref_ce_loss": 0.2077254205942154, + "step": 1430 + }, + { + "epoch": 0.48032021347565046, + "loss": 0.7953, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "grad_norm": 2.2697198390960693, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "learning_rate": 0.00029929154044806063, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "loss": 0.8644044995307922, + "step": 1440 + }, + { + "ce_loss": 0.291441947221756, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "distill_loss": 0.11410816758871078, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "ref_ce_loss": 0.12129537761211395, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "loss": 1.0806946754455566, + "step": 1440 + }, + { + "ce_loss": 0.29708412289619446, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "distill_loss": 0.1344713270664215, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "ref_ce_loss": 0.1404844969511032, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "loss": 0.6967662572860718, + "step": 1440 + }, + { + "ce_loss": 0.31594088673591614, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "distill_loss": 0.10365678369998932, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "ref_ce_loss": 0.19828706979751587, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "loss": 0.8506115674972534, + "step": 1440 + }, + { + "ce_loss": 0.26143166422843933, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "distill_loss": 0.11792758107185364, + "epoch": 0.48032021347565046, + "step": 1440 + }, + { + "epoch": 0.48032021347565046, + "ref_ce_loss": 0.1731175184249878, + "step": 1440 + }, + { + "epoch": 0.4836557705136758, + "loss": 0.8059, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "grad_norm": 2.886183738708496, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "learning_rate": 0.00029927174040345403, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "loss": 0.8864375352859497, + "step": 1450 + }, + { + "ce_loss": 0.3058259189128876, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "distill_loss": 0.11227105557918549, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "ref_ce_loss": 0.21765603125095367, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "loss": 0.499306857585907, + "step": 1450 + }, + { + "ce_loss": 0.2557595372200012, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "distill_loss": 0.10255733132362366, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "ref_ce_loss": 0.14077462255954742, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "loss": 1.0722293853759766, + "step": 1450 + }, + { + "ce_loss": 0.47057244181632996, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "distill_loss": 0.14682574570178986, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "ref_ce_loss": 0.19197282195091248, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "loss": 0.6186968088150024, + "step": 1450 + }, + { + "ce_loss": 0.29221686720848083, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "distill_loss": 0.1100810170173645, + "epoch": 0.4836557705136758, + "step": 1450 + }, + { + "epoch": 0.4836557705136758, + "ref_ce_loss": 0.15563970804214478, + "step": 1450 + }, + { + "epoch": 0.48699132755170116, + "loss": 0.9554, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "grad_norm": 2.7962470054626465, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "learning_rate": 0.00029925166814607585, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "loss": 1.372177243232727, + "step": 1460 + }, + { + "ce_loss": 0.3510309159755707, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "distill_loss": 0.1559959203004837, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "ref_ce_loss": 0.14250187575817108, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "loss": 0.8158239722251892, + "step": 1460 + }, + { + "ce_loss": 0.32633304595947266, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "distill_loss": 0.1910652071237564, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "ref_ce_loss": 0.16944916546344757, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "loss": 1.0479116439819336, + "step": 1460 + }, + { + "ce_loss": 0.39617040753364563, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "distill_loss": 0.19950151443481445, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "ref_ce_loss": 0.16884933412075043, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "loss": 0.8634979724884033, + "step": 1460 + }, + { + "ce_loss": 0.3990035355091095, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "distill_loss": 0.1979626566171646, + "epoch": 0.48699132755170116, + "step": 1460 + }, + { + "epoch": 0.48699132755170116, + "ref_ce_loss": 0.18388716876506805, + "step": 1460 + }, + { + "epoch": 0.4903268845897265, + "loss": 0.8563, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "grad_norm": 1.9522337913513184, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "learning_rate": 0.00029923132371252993, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "loss": 0.5769478678703308, + "step": 1470 + }, + { + "ce_loss": 0.2803075611591339, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "distill_loss": 0.1305488646030426, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "ref_ce_loss": 0.12108226865530014, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "loss": 0.784702479839325, + "step": 1470 + }, + { + "ce_loss": 0.33124253153800964, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "distill_loss": 0.1394960582256317, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "ref_ce_loss": 0.20602154731750488, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "loss": 0.5377941131591797, + "step": 1470 + }, + { + "ce_loss": 0.2358846515417099, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "distill_loss": 0.1433420330286026, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "ref_ce_loss": 0.14286667108535767, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "loss": 0.8376799821853638, + "step": 1470 + }, + { + "ce_loss": 0.2516549825668335, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "distill_loss": 0.11286590993404388, + "epoch": 0.4903268845897265, + "step": 1470 + }, + { + "epoch": 0.4903268845897265, + "ref_ce_loss": 0.09511788934469223, + "step": 1470 + }, + { + "epoch": 0.49366244162775186, + "loss": 0.8752, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "grad_norm": 3.132399797439575, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "learning_rate": 0.0002992107071399165, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "loss": 1.5298254489898682, + "step": 1480 + }, + { + "ce_loss": 0.39849844574928284, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "distill_loss": 0.16513606905937195, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "ref_ce_loss": 0.2410481870174408, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "loss": 1.0180957317352295, + "step": 1480 + }, + { + "ce_loss": 0.33581236004829407, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "distill_loss": 0.15931996703147888, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "ref_ce_loss": 0.21301725506782532, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "loss": 0.7502650022506714, + "step": 1480 + }, + { + "ce_loss": 0.382405549287796, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "distill_loss": 0.12572389841079712, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "ref_ce_loss": 0.17360630631446838, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "loss": 1.0398483276367188, + "step": 1480 + }, + { + "ce_loss": 0.30432286858558655, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "distill_loss": 0.14817538857460022, + "epoch": 0.49366244162775186, + "step": 1480 + }, + { + "epoch": 0.49366244162775186, + "ref_ce_loss": 0.17467889189720154, + "step": 1480 + }, + { + "epoch": 0.4969979986657772, + "loss": 0.8252, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "grad_norm": 4.466595649719238, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "learning_rate": 0.0002991898184658321, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "loss": 0.729331910610199, + "step": 1490 + }, + { + "ce_loss": 0.22167912125587463, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "distill_loss": 0.1331212818622589, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "ref_ce_loss": 0.10691002011299133, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "loss": 0.8190153241157532, + "step": 1490 + }, + { + "ce_loss": 0.3913503587245941, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "distill_loss": 0.11496517062187195, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "ref_ce_loss": 0.19716744124889374, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "loss": 0.6939219832420349, + "step": 1490 + }, + { + "ce_loss": 0.3533763289451599, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "distill_loss": 0.12615175545215607, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "ref_ce_loss": 0.21436937153339386, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "loss": 0.7554702758789062, + "step": 1490 + }, + { + "ce_loss": 0.2819823920726776, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "distill_loss": 0.13555271923542023, + "epoch": 0.4969979986657772, + "step": 1490 + }, + { + "epoch": 0.4969979986657772, + "ref_ce_loss": 0.224581778049469, + "step": 1490 + }, + { + "epoch": 0.5003335557038026, + "loss": 0.8087, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "grad_norm": 4.537039279937744, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "learning_rate": 0.0002991686577283694, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "loss": 0.8498672246932983, + "step": 1500 + }, + { + "ce_loss": 0.33424246311187744, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "distill_loss": 0.11146187782287598, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "ref_ce_loss": 0.133957639336586, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "loss": 0.7086281180381775, + "step": 1500 + }, + { + "ce_loss": 0.29514920711517334, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "distill_loss": 0.09464933723211288, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "ref_ce_loss": 0.21752260625362396, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "loss": 1.3933093547821045, + "step": 1500 + }, + { + "ce_loss": 0.5271459817886353, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "distill_loss": 0.14879895746707916, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "ref_ce_loss": 0.24883712828159332, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "loss": 0.5399357080459595, + "step": 1500 + }, + { + "ce_loss": 0.23881879448890686, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "distill_loss": 0.09969379752874374, + "epoch": 0.5003335557038026, + "step": 1500 + }, + { + "epoch": 0.5003335557038026, + "ref_ce_loss": 0.12885983288288116, + "step": 1500 + }, + { + "epoch": 0.5036691127418279, + "loss": 0.8127, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "grad_norm": 1.9899190664291382, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "learning_rate": 0.0002991472249661172, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "loss": 0.611236572265625, + "step": 1510 + }, + { + "ce_loss": 0.2055187225341797, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "distill_loss": 0.1261042356491089, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "ref_ce_loss": 0.17034177482128143, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "loss": 1.0687274932861328, + "step": 1510 + }, + { + "ce_loss": 0.2887864410877228, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "distill_loss": 0.13521718978881836, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "ref_ce_loss": 0.17292891442775726, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "loss": 0.822563111782074, + "step": 1510 + }, + { + "ce_loss": 0.3748740255832672, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "distill_loss": 0.1336393654346466, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "ref_ce_loss": 0.2078617364168167, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "loss": 0.7008830904960632, + "step": 1510 + }, + { + "ce_loss": 0.31130093336105347, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "distill_loss": 0.1265094131231308, + "epoch": 0.5036691127418279, + "step": 1510 + }, + { + "epoch": 0.5036691127418279, + "ref_ce_loss": 0.18166042864322662, + "step": 1510 + }, + { + "epoch": 0.5070046697798533, + "loss": 0.7516, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "grad_norm": 3.6005780696868896, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "learning_rate": 0.00029912552021816045, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "loss": 0.7088308930397034, + "step": 1520 + }, + { + "ce_loss": 0.252478688955307, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "distill_loss": 0.08968272060155869, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "ref_ce_loss": 0.1475578248500824, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "loss": 0.718932032585144, + "step": 1520 + }, + { + "ce_loss": 0.35802438855171204, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "distill_loss": 0.10070653259754181, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "ref_ce_loss": 0.12593254446983337, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "loss": 0.6470715403556824, + "step": 1520 + }, + { + "ce_loss": 0.2930530607700348, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "distill_loss": 0.11143629252910614, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "ref_ce_loss": 0.17003856599330902, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "loss": 0.6389113664627075, + "step": 1520 + }, + { + "ce_loss": 0.37416961789131165, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "distill_loss": 0.11495236307382584, + "epoch": 0.5070046697798533, + "step": 1520 + }, + { + "epoch": 0.5070046697798533, + "ref_ce_loss": 0.1497492641210556, + "step": 1520 + }, + { + "epoch": 0.5103402268178786, + "loss": 0.7928, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "grad_norm": 2.498560667037964, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "learning_rate": 0.00029910354352408, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "loss": 0.6787996888160706, + "step": 1530 + }, + { + "ce_loss": 0.26317986845970154, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "distill_loss": 0.11007644236087799, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "ref_ce_loss": 0.22199080884456635, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "loss": 0.5043588876724243, + "step": 1530 + }, + { + "ce_loss": 0.2592538893222809, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "distill_loss": 0.12781588733196259, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "ref_ce_loss": 0.11714787781238556, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "loss": 0.6850191950798035, + "step": 1530 + }, + { + "ce_loss": 0.3347513973712921, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "distill_loss": 0.11932611465454102, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "ref_ce_loss": 0.23088660836219788, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "loss": 0.5668002367019653, + "step": 1530 + }, + { + "ce_loss": 0.20559754967689514, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "distill_loss": 0.11886685341596603, + "epoch": 0.5103402268178786, + "step": 1530 + }, + { + "epoch": 0.5103402268178786, + "ref_ce_loss": 0.16831466555595398, + "step": 1530 + }, + { + "epoch": 0.513675783855904, + "loss": 0.8024, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "grad_norm": 3.3080246448516846, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "learning_rate": 0.0002990812949239528, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "loss": 0.6707828044891357, + "step": 1540 + }, + { + "ce_loss": 0.2728685736656189, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "distill_loss": 0.12154404073953629, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "ref_ce_loss": 0.18021827936172485, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "loss": 0.7965375185012817, + "step": 1540 + }, + { + "ce_loss": 0.34719210863113403, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "distill_loss": 0.09814593195915222, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "ref_ce_loss": 0.25575193762779236, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "loss": 0.8639137148857117, + "step": 1540 + }, + { + "ce_loss": 0.3147978186607361, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "distill_loss": 0.11294306814670563, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "ref_ce_loss": 0.2474347949028015, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "loss": 0.4631844758987427, + "step": 1540 + }, + { + "ce_loss": 0.2097543329000473, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "distill_loss": 0.09885016828775406, + "epoch": 0.513675783855904, + "step": 1540 + }, + { + "epoch": 0.513675783855904, + "ref_ce_loss": 0.1541988104581833, + "step": 1540 + }, + { + "epoch": 0.5170113408939293, + "loss": 0.7923, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "grad_norm": 1.7913470268249512, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "learning_rate": 0.0002990587744583514, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "loss": 0.7131497859954834, + "step": 1550 + }, + { + "ce_loss": 0.35181015729904175, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "distill_loss": 0.11702708154916763, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "ref_ce_loss": 0.18052375316619873, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "loss": 0.707023024559021, + "step": 1550 + }, + { + "ce_loss": 0.27232182025909424, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "distill_loss": 0.10445040464401245, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "ref_ce_loss": 0.13911911845207214, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "loss": 0.6784090995788574, + "step": 1550 + }, + { + "ce_loss": 0.2939833402633667, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "distill_loss": 0.08660943061113358, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "ref_ce_loss": 0.18671560287475586, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "loss": 1.2959554195404053, + "step": 1550 + }, + { + "ce_loss": 0.3662611246109009, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "distill_loss": 0.11476387083530426, + "epoch": 0.5170113408939293, + "step": 1550 + }, + { + "epoch": 0.5170113408939293, + "ref_ce_loss": 0.1677011251449585, + "step": 1550 + }, + { + "epoch": 0.5203468979319547, + "loss": 0.8326, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "grad_norm": 2.679382562637329, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "learning_rate": 0.0002990359821683443, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "loss": 0.5702127814292908, + "step": 1560 + }, + { + "ce_loss": 0.21940156817436218, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "distill_loss": 0.1002032607793808, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "ref_ce_loss": 0.16673019528388977, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "loss": 0.8199391961097717, + "step": 1560 + }, + { + "ce_loss": 0.2875426709651947, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "distill_loss": 0.1570519655942917, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "ref_ce_loss": 0.1424614042043686, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "loss": 0.8121156692504883, + "step": 1560 + }, + { + "ce_loss": 0.3326272964477539, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "distill_loss": 0.13725823163986206, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "ref_ce_loss": 0.2615748643875122, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "loss": 0.8154400587081909, + "step": 1560 + }, + { + "ce_loss": 0.2914092242717743, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "distill_loss": 0.14719057083129883, + "epoch": 0.5203468979319547, + "step": 1560 + }, + { + "epoch": 0.5203468979319547, + "ref_ce_loss": 0.21865858137607574, + "step": 1560 + }, + { + "epoch": 0.52368245496998, + "loss": 0.7554, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "grad_norm": 3.8052194118499756, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "learning_rate": 0.0002990129180954956, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "loss": 0.7150813341140747, + "step": 1570 + }, + { + "ce_loss": 0.18721269071102142, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "distill_loss": 0.10450568795204163, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "ref_ce_loss": 0.10055077075958252, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "loss": 0.7558162212371826, + "step": 1570 + }, + { + "ce_loss": 0.25569626688957214, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "distill_loss": 0.12277135252952576, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "ref_ce_loss": 0.12362133711576462, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "loss": 1.325268030166626, + "step": 1570 + }, + { + "ce_loss": 0.3507300913333893, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "distill_loss": 0.1343396157026291, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "ref_ce_loss": 0.2221112996339798, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "loss": 0.6346928477287292, + "step": 1570 + }, + { + "ce_loss": 0.2960171401500702, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "distill_loss": 0.1436004638671875, + "epoch": 0.52368245496998, + "step": 1570 + }, + { + "epoch": 0.52368245496998, + "ref_ce_loss": 0.11697656661272049, + "step": 1570 + }, + { + "epoch": 0.5270180120080054, + "loss": 0.7996, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "grad_norm": 1.8267759084701538, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "learning_rate": 0.0002989895822818651, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "loss": 0.6526615619659424, + "step": 1580 + }, + { + "ce_loss": 0.15909507870674133, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "distill_loss": 0.12408677488565445, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "ref_ce_loss": 0.19116757810115814, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "loss": 0.5174135565757751, + "step": 1580 + }, + { + "ce_loss": 0.16595187783241272, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "distill_loss": 0.13370013236999512, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "ref_ce_loss": 0.1358727216720581, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "loss": 0.6547532677650452, + "step": 1580 + }, + { + "ce_loss": 0.30841338634490967, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "distill_loss": 0.1410057544708252, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "ref_ce_loss": 0.15567293763160706, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "loss": 0.6471787691116333, + "step": 1580 + }, + { + "ce_loss": 0.27511271834373474, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "distill_loss": 0.136945903301239, + "epoch": 0.5270180120080054, + "step": 1580 + }, + { + "epoch": 0.5270180120080054, + "ref_ce_loss": 0.23433144390583038, + "step": 1580 + }, + { + "epoch": 0.5303535690460307, + "loss": 0.7405, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "grad_norm": 1.9112555980682373, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "learning_rate": 0.00029896597477000803, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "loss": 0.8480120301246643, + "step": 1590 + }, + { + "ce_loss": 0.2612304091453552, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "distill_loss": 0.12277919799089432, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "ref_ce_loss": 0.14804160594940186, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "loss": 0.6312728524208069, + "step": 1590 + }, + { + "ce_loss": 0.299432635307312, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "distill_loss": 0.13859985768795013, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "ref_ce_loss": 0.19271425902843475, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "loss": 0.5959420800209045, + "step": 1590 + }, + { + "ce_loss": 0.27959901094436646, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "distill_loss": 0.12362391501665115, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "ref_ce_loss": 0.1926553100347519, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "loss": 0.6874289512634277, + "step": 1590 + }, + { + "ce_loss": 0.3542376160621643, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "distill_loss": 0.1559729278087616, + "epoch": 0.5303535690460307, + "step": 1590 + }, + { + "epoch": 0.5303535690460307, + "ref_ce_loss": 0.1770813763141632, + "step": 1590 + }, + { + "epoch": 0.533689126084056, + "loss": 0.7949, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "grad_norm": 1.7462742328643799, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "learning_rate": 0.00029894209560297536, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "loss": 0.7205913066864014, + "step": 1600 + }, + { + "ce_loss": 0.3525589406490326, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "distill_loss": 0.16837641596794128, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "ref_ce_loss": 0.19949723780155182, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "loss": 1.0897554159164429, + "step": 1600 + }, + { + "ce_loss": 0.24649381637573242, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "distill_loss": 0.14514781534671783, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "ref_ce_loss": 0.20254717767238617, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "loss": 0.6646853089332581, + "step": 1600 + }, + { + "ce_loss": 0.2682981491088867, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "distill_loss": 0.1883355677127838, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "ref_ce_loss": 0.11741480231285095, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "loss": 0.6180636882781982, + "step": 1600 + }, + { + "ce_loss": 0.2548883855342865, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "distill_loss": 0.16024237871170044, + "epoch": 0.533689126084056, + "step": 1600 + }, + { + "epoch": 0.533689126084056, + "ref_ce_loss": 0.10766757279634476, + "step": 1600 + }, + { + "epoch": 0.5370246831220814, + "loss": 0.8356, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "grad_norm": 2.262484312057495, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "learning_rate": 0.00029891794482431313, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "loss": 1.4120798110961914, + "step": 1610 + }, + { + "ce_loss": 0.3346101939678192, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "distill_loss": 0.14435593783855438, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "ref_ce_loss": 0.17607466876506805, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "loss": 0.8668168187141418, + "step": 1610 + }, + { + "ce_loss": 0.2469944804906845, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "distill_loss": 0.11261950433254242, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "ref_ce_loss": 0.20903988182544708, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "loss": 0.6093182563781738, + "step": 1610 + }, + { + "ce_loss": 0.27436932921409607, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "distill_loss": 0.1148870661854744, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "ref_ce_loss": 0.21992698311805725, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "loss": 0.9816043376922607, + "step": 1610 + }, + { + "ce_loss": 0.3318880796432495, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "distill_loss": 0.1468740850687027, + "epoch": 0.5370246831220814, + "step": 1610 + }, + { + "epoch": 0.5370246831220814, + "ref_ce_loss": 0.17123477160930634, + "step": 1610 + }, + { + "epoch": 0.5403602401601068, + "loss": 0.907, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "grad_norm": 4.384459972381592, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "learning_rate": 0.0002988935224780629, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "loss": 0.7424207329750061, + "step": 1620 + }, + { + "ce_loss": 0.3590454161167145, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "distill_loss": 0.17425327003002167, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "ref_ce_loss": 0.20896899700164795, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "loss": 0.6844779849052429, + "step": 1620 + }, + { + "ce_loss": 0.3099417984485626, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "distill_loss": 0.14133571088314056, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "ref_ce_loss": 0.15446357429027557, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "loss": 0.7723664045333862, + "step": 1620 + }, + { + "ce_loss": 0.3412700593471527, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "distill_loss": 0.14123709499835968, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "ref_ce_loss": 0.18426023423671722, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "loss": 1.0038042068481445, + "step": 1620 + }, + { + "ce_loss": 0.2636459171772003, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "distill_loss": 0.14897647500038147, + "epoch": 0.5403602401601068, + "step": 1620 + }, + { + "epoch": 0.5403602401601068, + "ref_ce_loss": 0.16534221172332764, + "step": 1620 + }, + { + "epoch": 0.5436957971981321, + "loss": 0.767, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "grad_norm": 1.8759695291519165, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "learning_rate": 0.00029886882860876134, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "loss": 0.6272376775741577, + "step": 1630 + }, + { + "ce_loss": 0.21282872557640076, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "distill_loss": 0.11778245866298676, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "ref_ce_loss": 0.20675376057624817, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "loss": 0.5564039945602417, + "step": 1630 + }, + { + "ce_loss": 0.19067807495594025, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "distill_loss": 0.12105537950992584, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "ref_ce_loss": 0.1097465455532074, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "loss": 0.9002183675765991, + "step": 1630 + }, + { + "ce_loss": 0.386663019657135, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "distill_loss": 0.1761159598827362, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "ref_ce_loss": 0.21742606163024902, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "loss": 0.7056699395179749, + "step": 1630 + }, + { + "ce_loss": 0.23579426109790802, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "distill_loss": 0.13872799277305603, + "epoch": 0.5436957971981321, + "step": 1630 + }, + { + "epoch": 0.5436957971981321, + "ref_ce_loss": 0.12659761309623718, + "step": 1630 + }, + { + "epoch": 0.5470313542361575, + "loss": 0.8049, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "grad_norm": 1.8977726697921753, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "learning_rate": 0.0002988438632614404, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "loss": 0.73987877368927, + "step": 1640 + }, + { + "ce_loss": 0.337587833404541, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "distill_loss": 0.11915989220142365, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "ref_ce_loss": 0.16175560653209686, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "loss": 1.201418161392212, + "step": 1640 + }, + { + "ce_loss": 0.3867391049861908, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "distill_loss": 0.1336623579263687, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "ref_ce_loss": 0.20569270849227905, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "loss": 1.7483572959899902, + "step": 1640 + }, + { + "ce_loss": 0.2928985059261322, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "distill_loss": 0.1285347044467926, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "ref_ce_loss": 0.19875165820121765, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "loss": 0.7967950701713562, + "step": 1640 + }, + { + "ce_loss": 0.39166033267974854, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "distill_loss": 0.1525496244430542, + "epoch": 0.5470313542361575, + "step": 1640 + }, + { + "epoch": 0.5470313542361575, + "ref_ce_loss": 0.25225165486335754, + "step": 1640 + }, + { + "epoch": 0.5503669112741828, + "loss": 0.8833, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "grad_norm": 4.775393486022949, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "learning_rate": 0.00029881862648162695, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "loss": 0.8248935341835022, + "step": 1650 + }, + { + "ce_loss": 0.34251272678375244, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "distill_loss": 0.15160521864891052, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "ref_ce_loss": 0.120022252202034, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "loss": 0.7766433358192444, + "step": 1650 + }, + { + "ce_loss": 0.2661416530609131, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "distill_loss": 0.11151763796806335, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "ref_ce_loss": 0.18171195685863495, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "loss": 0.6041874885559082, + "step": 1650 + }, + { + "ce_loss": 0.25433608889579773, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "distill_loss": 0.12476801872253418, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "ref_ce_loss": 0.1334761679172516, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "loss": 0.5187742710113525, + "step": 1650 + }, + { + "ce_loss": 0.26460695266723633, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "distill_loss": 0.13951659202575684, + "epoch": 0.5503669112741828, + "step": 1650 + }, + { + "epoch": 0.5503669112741828, + "ref_ce_loss": 0.10023458302021027, + "step": 1650 + }, + { + "epoch": 0.5537024683122082, + "loss": 0.7497, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "grad_norm": 2.310690402984619, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "learning_rate": 0.0002987931183153429, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "loss": 0.5635334253311157, + "step": 1660 + }, + { + "ce_loss": 0.2891744375228882, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "distill_loss": 0.14947661757469177, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "ref_ce_loss": 0.12474282830953598, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "loss": 0.8831844329833984, + "step": 1660 + }, + { + "ce_loss": 0.41249531507492065, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "distill_loss": 0.16750311851501465, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "ref_ce_loss": 0.19170571863651276, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "loss": 0.8272402882575989, + "step": 1660 + }, + { + "ce_loss": 0.3753838539123535, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "distill_loss": 0.15682774782180786, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "ref_ce_loss": 0.128701314330101, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "loss": 0.8349854946136475, + "step": 1660 + }, + { + "ce_loss": 0.24851606786251068, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "distill_loss": 0.10116152465343475, + "epoch": 0.5537024683122082, + "step": 1660 + }, + { + "epoch": 0.5537024683122082, + "ref_ce_loss": 0.22549472749233246, + "step": 1660 + }, + { + "epoch": 0.5570380253502335, + "loss": 0.8069, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "grad_norm": 3.4684464931488037, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "learning_rate": 0.00029876733880910525, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "loss": 0.5241039991378784, + "step": 1670 + }, + { + "ce_loss": 0.2068227082490921, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "distill_loss": 0.10971537977457047, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "ref_ce_loss": 0.14565956592559814, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "loss": 0.5024033784866333, + "step": 1670 + }, + { + "ce_loss": 0.23021751642227173, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "distill_loss": 0.10708209127187729, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "ref_ce_loss": 0.1649056375026703, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "loss": 0.9969327449798584, + "step": 1670 + }, + { + "ce_loss": 0.33395951986312866, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "distill_loss": 0.1525161862373352, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "ref_ce_loss": 0.16089755296707153, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "loss": 0.8883379101753235, + "step": 1670 + }, + { + "ce_loss": 0.3117881715297699, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "distill_loss": 0.11904864758253098, + "epoch": 0.5570380253502335, + "step": 1670 + }, + { + "epoch": 0.5570380253502335, + "ref_ce_loss": 0.21285606920719147, + "step": 1670 + }, + { + "epoch": 0.5603735823882589, + "loss": 0.7748, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "grad_norm": 2.624232292175293, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "learning_rate": 0.00029874128800992547, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "loss": 0.6730021834373474, + "step": 1680 + }, + { + "ce_loss": 0.2944241762161255, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "distill_loss": 0.15666545927524567, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "ref_ce_loss": 0.15864133834838867, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "loss": 0.8667658567428589, + "step": 1680 + }, + { + "ce_loss": 0.4391118288040161, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "distill_loss": 0.127987802028656, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "ref_ce_loss": 0.21471063792705536, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "loss": 0.6558547616004944, + "step": 1680 + }, + { + "ce_loss": 0.3125191628932953, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "distill_loss": 0.11761464178562164, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "ref_ce_loss": 0.17394804954528809, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "loss": 0.8694894313812256, + "step": 1680 + }, + { + "ce_loss": 0.38176044821739197, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "distill_loss": 0.13654875755310059, + "epoch": 0.5603735823882589, + "step": 1680 + }, + { + "epoch": 0.5603735823882589, + "ref_ce_loss": 0.20033635199069977, + "step": 1680 + }, + { + "epoch": 0.5637091394262842, + "loss": 0.8365, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "grad_norm": 3.2551002502441406, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "learning_rate": 0.00029871496596531, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "loss": 1.111056923866272, + "step": 1690 + }, + { + "ce_loss": 0.23771966993808746, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "distill_loss": 0.11728285998106003, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "ref_ce_loss": 0.10274969041347504, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "loss": 0.8305438756942749, + "step": 1690 + }, + { + "ce_loss": 0.33233582973480225, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "distill_loss": 0.1357385665178299, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "ref_ce_loss": 0.118039071559906, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "loss": 0.844835638999939, + "step": 1690 + }, + { + "ce_loss": 0.25645163655281067, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "distill_loss": 0.10402382165193558, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "ref_ce_loss": 0.1545470505952835, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "loss": 0.615341067314148, + "step": 1690 + }, + { + "ce_loss": 0.24444253742694855, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "distill_loss": 0.09630996733903885, + "epoch": 0.5637091394262842, + "step": 1690 + }, + { + "epoch": 0.5637091394262842, + "ref_ce_loss": 0.12920692563056946, + "step": 1690 + }, + { + "epoch": 0.5670446964643095, + "loss": 0.7819, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "grad_norm": 2.2229459285736084, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "learning_rate": 0.00029868837272325994, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "loss": 0.6957921981811523, + "step": 1700 + }, + { + "ce_loss": 0.1739863008260727, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "distill_loss": 0.10056845843791962, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "ref_ce_loss": 0.12460286170244217, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "loss": 0.725322425365448, + "step": 1700 + }, + { + "ce_loss": 0.3335936963558197, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "distill_loss": 0.12524354457855225, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "ref_ce_loss": 0.15567870438098907, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "loss": 0.6163183450698853, + "step": 1700 + }, + { + "ce_loss": 0.24315491318702698, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "distill_loss": 0.1256161630153656, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "ref_ce_loss": 0.1663728654384613, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "loss": 1.0663681030273438, + "step": 1700 + }, + { + "ce_loss": 0.2814297378063202, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "distill_loss": 0.13603630661964417, + "epoch": 0.5670446964643095, + "step": 1700 + }, + { + "epoch": 0.5670446964643095, + "ref_ce_loss": 0.1546018123626709, + "step": 1700 + }, + { + "epoch": 0.5703802535023349, + "loss": 0.7102, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "grad_norm": 2.1799120903015137, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "learning_rate": 0.0002986615083322708, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "loss": 0.7568995952606201, + "step": 1710 + }, + { + "ce_loss": 0.3033449053764343, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "distill_loss": 0.10636651515960693, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "ref_ce_loss": 0.1264645904302597, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "loss": 0.6913808584213257, + "step": 1710 + }, + { + "ce_loss": 0.2926919460296631, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "distill_loss": 0.14209294319152832, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "ref_ce_loss": 0.16757725179195404, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "loss": 0.5459708571434021, + "step": 1710 + }, + { + "ce_loss": 0.2636614441871643, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "distill_loss": 0.14287355542182922, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "ref_ce_loss": 0.13888999819755554, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "loss": 1.1922392845153809, + "step": 1710 + }, + { + "ce_loss": 0.22226674854755402, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "distill_loss": 0.10548727214336395, + "epoch": 0.5703802535023349, + "step": 1710 + }, + { + "epoch": 0.5703802535023349, + "ref_ce_loss": 0.12790684401988983, + "step": 1710 + }, + { + "epoch": 0.5737158105403602, + "loss": 0.8126, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "grad_norm": 3.163691759109497, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "learning_rate": 0.0002986343728413326, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "loss": 0.9181846380233765, + "step": 1720 + }, + { + "ce_loss": 0.3268508017063141, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "distill_loss": 0.13751642405986786, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "ref_ce_loss": 0.16825208067893982, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "loss": 0.8629528284072876, + "step": 1720 + }, + { + "ce_loss": 0.27521952986717224, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "distill_loss": 0.14840242266654968, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "ref_ce_loss": 0.11532442271709442, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "loss": 0.5458701848983765, + "step": 1720 + }, + { + "ce_loss": 0.24639998376369476, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "distill_loss": 0.13376955687999725, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "ref_ce_loss": 0.16517247259616852, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "loss": 0.6869808435440063, + "step": 1720 + }, + { + "ce_loss": 0.254602313041687, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "distill_loss": 0.13055302202701569, + "epoch": 0.5737158105403602, + "step": 1720 + }, + { + "epoch": 0.5737158105403602, + "ref_ce_loss": 0.152826726436615, + "step": 1720 + }, + { + "epoch": 0.5770513675783856, + "loss": 0.8095, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "grad_norm": 5.802720546722412, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "learning_rate": 0.0002986069662999298, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "loss": 0.8158540725708008, + "step": 1730 + }, + { + "ce_loss": 0.2759875953197479, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "distill_loss": 0.151884526014328, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "ref_ce_loss": 0.1573365181684494, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "loss": 0.932378888130188, + "step": 1730 + }, + { + "ce_loss": 0.3446933329105377, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "distill_loss": 0.1564919650554657, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "ref_ce_loss": 0.16395673155784607, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "loss": 0.7885000705718994, + "step": 1730 + }, + { + "ce_loss": 0.26280203461647034, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "distill_loss": 0.134730726480484, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "ref_ce_loss": 0.12668967247009277, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "loss": 0.7057376503944397, + "step": 1730 + }, + { + "ce_loss": 0.29862749576568604, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "distill_loss": 0.1523112654685974, + "epoch": 0.5770513675783856, + "step": 1730 + }, + { + "epoch": 0.5770513675783856, + "ref_ce_loss": 0.18865908682346344, + "step": 1730 + }, + { + "epoch": 0.580386924616411, + "loss": 0.8146, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "grad_norm": 2.1819822788238525, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "learning_rate": 0.0002985792887580412, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "loss": 0.4795108139514923, + "step": 1740 + }, + { + "ce_loss": 0.1699635088443756, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "distill_loss": 0.12625162303447723, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "ref_ce_loss": 0.1395426243543625, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "loss": 1.1031620502471924, + "step": 1740 + }, + { + "ce_loss": 0.2764243185520172, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "distill_loss": 0.16979770362377167, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "ref_ce_loss": 0.22301478683948517, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "loss": 0.7303873896598816, + "step": 1740 + }, + { + "ce_loss": 0.3002098500728607, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "distill_loss": 0.15270881354808807, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "ref_ce_loss": 0.17616495490074158, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "loss": 0.7694774866104126, + "step": 1740 + }, + { + "ce_loss": 0.30607903003692627, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "distill_loss": 0.13875719904899597, + "epoch": 0.580386924616411, + "step": 1740 + }, + { + "epoch": 0.580386924616411, + "ref_ce_loss": 0.20177951455116272, + "step": 1740 + }, + { + "epoch": 0.5837224816544363, + "loss": 0.8148, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "grad_norm": 2.0567026138305664, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "learning_rate": 0.00029855134026613963, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "loss": 0.6456379294395447, + "step": 1750 + }, + { + "ce_loss": 0.2544163763523102, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "distill_loss": 0.15112867951393127, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "ref_ce_loss": 0.13789910078048706, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "loss": 0.6710749864578247, + "step": 1750 + }, + { + "ce_loss": 0.22394444048404694, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "distill_loss": 0.13627192378044128, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "ref_ce_loss": 0.21288058161735535, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "loss": 0.8853602409362793, + "step": 1750 + }, + { + "ce_loss": 0.33082449436187744, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "distill_loss": 0.17533904314041138, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "ref_ce_loss": 0.23648829758167267, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "loss": 0.6773439645767212, + "step": 1750 + }, + { + "ce_loss": 0.258245587348938, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "distill_loss": 0.1750703752040863, + "epoch": 0.5837224816544363, + "step": 1750 + }, + { + "epoch": 0.5837224816544363, + "ref_ce_loss": 0.16324280202388763, + "step": 1750 + }, + { + "epoch": 0.5870580386924616, + "loss": 0.7823, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "grad_norm": 3.1371705532073975, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "learning_rate": 0.0002985231208751921, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "loss": 1.2690577507019043, + "step": 1760 + }, + { + "ce_loss": 0.36192673444747925, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "distill_loss": 0.1863507330417633, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "ref_ce_loss": 0.1751195639371872, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "loss": 0.8799837231636047, + "step": 1760 + }, + { + "ce_loss": 0.349637895822525, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "distill_loss": 0.1971992552280426, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "ref_ce_loss": 0.08908438682556152, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "loss": 0.8331767916679382, + "step": 1760 + }, + { + "ce_loss": 0.32722359895706177, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "distill_loss": 0.17739370465278625, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "ref_ce_loss": 0.17605829238891602, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "loss": 0.6821324825286865, + "step": 1760 + }, + { + "ce_loss": 0.28926607966423035, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "distill_loss": 0.20537421107292175, + "epoch": 0.5870580386924616, + "step": 1760 + }, + { + "epoch": 0.5870580386924616, + "ref_ce_loss": 0.12078510224819183, + "step": 1760 + }, + { + "epoch": 0.590393595730487, + "loss": 0.8654, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "grad_norm": 4.211384296417236, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "learning_rate": 0.00029849463063665965, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "loss": 0.622290849685669, + "step": 1770 + }, + { + "ce_loss": 0.2384355068206787, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "distill_loss": 0.14726680517196655, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "ref_ce_loss": 0.18940134346485138, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "loss": 0.5565788149833679, + "step": 1770 + }, + { + "ce_loss": 0.2837248146533966, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "distill_loss": 0.1558532416820526, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "ref_ce_loss": 0.11656032502651215, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "loss": 1.1380963325500488, + "step": 1770 + }, + { + "ce_loss": 0.41929349303245544, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "distill_loss": 0.2181171178817749, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "ref_ce_loss": 0.17369946837425232, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "loss": 0.6213510632514954, + "step": 1770 + }, + { + "ce_loss": 0.24765095114707947, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "distill_loss": 0.193634033203125, + "epoch": 0.590393595730487, + "step": 1770 + }, + { + "epoch": 0.590393595730487, + "ref_ce_loss": 0.12431634962558746, + "step": 1770 + }, + { + "epoch": 0.5937291527685123, + "loss": 0.8174, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "grad_norm": 2.910914659500122, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "learning_rate": 0.00029846586960249736, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "loss": 1.329070806503296, + "step": 1780 + }, + { + "ce_loss": 0.19489771127700806, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "distill_loss": 0.1673344075679779, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "ref_ce_loss": 0.1715712547302246, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "loss": 0.7473481297492981, + "step": 1780 + }, + { + "ce_loss": 0.2816266715526581, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "distill_loss": 0.20028121769428253, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "ref_ce_loss": 0.13073772192001343, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "loss": 1.6454558372497559, + "step": 1780 + }, + { + "ce_loss": 0.4495353102684021, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "distill_loss": 0.18965190649032593, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "ref_ce_loss": 0.19479434192180634, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "loss": 0.7187461256980896, + "step": 1780 + }, + { + "ce_loss": 0.2073751837015152, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "distill_loss": 0.18652737140655518, + "epoch": 0.5937291527685123, + "step": 1780 + }, + { + "epoch": 0.5937291527685123, + "ref_ce_loss": 0.12575000524520874, + "step": 1780 + }, + { + "epoch": 0.5970647098065377, + "loss": 0.8508, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "grad_norm": 2.6389963626861572, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "learning_rate": 0.0002984368378251539, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "loss": 0.9331235885620117, + "step": 1790 + }, + { + "ce_loss": 0.3901205360889435, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "distill_loss": 0.26576167345046997, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "ref_ce_loss": 0.15986163914203644, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "loss": 0.6947316527366638, + "step": 1790 + }, + { + "ce_loss": 0.21353313326835632, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "distill_loss": 0.2191469818353653, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "ref_ce_loss": 0.15555934607982635, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "loss": 0.8559675216674805, + "step": 1790 + }, + { + "ce_loss": 0.3997640013694763, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "distill_loss": 0.20023198425769806, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "ref_ce_loss": 0.2548025846481323, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "loss": 0.5783416032791138, + "step": 1790 + }, + { + "ce_loss": 0.23126862943172455, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "distill_loss": 0.19612915813922882, + "epoch": 0.5970647098065377, + "step": 1790 + }, + { + "epoch": 0.5970647098065377, + "ref_ce_loss": 0.1506781429052353, + "step": 1790 + }, + { + "epoch": 0.600400266844563, + "loss": 0.8459, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "grad_norm": 2.6187963485717773, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "learning_rate": 0.0002984075353575718, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "loss": 0.7321758270263672, + "step": 1800 + }, + { + "ce_loss": 0.2416120320558548, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "distill_loss": 0.2531052231788635, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "ref_ce_loss": 0.1594499945640564, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "loss": 1.1592686176300049, + "step": 1800 + }, + { + "ce_loss": 0.24422048032283783, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "distill_loss": 0.29225999116897583, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "ref_ce_loss": 0.17981497943401337, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "loss": 0.8788959980010986, + "step": 1800 + }, + { + "ce_loss": 0.331389844417572, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "distill_loss": 0.30712389945983887, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "ref_ce_loss": 0.13194140791893005, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "loss": 0.7174859046936035, + "step": 1800 + }, + { + "ce_loss": 0.26181110739707947, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "distill_loss": 0.25586360692977905, + "epoch": 0.600400266844563, + "step": 1800 + }, + { + "epoch": 0.600400266844563, + "ref_ce_loss": 0.1297745257616043, + "step": 1800 + }, + { + "epoch": 0.6037358238825884, + "loss": 0.9145, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "grad_norm": 2.9618608951568604, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "learning_rate": 0.00029837796225318713, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "loss": 0.7965373992919922, + "step": 1810 + }, + { + "ce_loss": 0.2630294859409332, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "distill_loss": 0.25887030363082886, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "ref_ce_loss": 0.12762102484703064, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "loss": 1.3940396308898926, + "step": 1810 + }, + { + "ce_loss": 0.4177006781101227, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "distill_loss": 0.2584906816482544, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "ref_ce_loss": 0.22328118979930878, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "loss": 0.7931407690048218, + "step": 1810 + }, + { + "ce_loss": 0.3327951729297638, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "distill_loss": 0.24718835949897766, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "ref_ce_loss": 0.21300888061523438, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "loss": 1.0378825664520264, + "step": 1810 + }, + { + "ce_loss": 0.34166228771209717, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "distill_loss": 0.2605835795402527, + "epoch": 0.6037358238825884, + "step": 1810 + }, + { + "epoch": 0.6037358238825884, + "ref_ce_loss": 0.14847081899642944, + "step": 1810 + }, + { + "epoch": 0.6070713809206137, + "loss": 0.8354, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "grad_norm": 2.209144353866577, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "learning_rate": 0.00029834811856592974, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "loss": 1.0870418548583984, + "step": 1820 + }, + { + "ce_loss": 0.3245229125022888, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "distill_loss": 0.20548105239868164, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "ref_ce_loss": 0.15288129448890686, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "loss": 1.2108877897262573, + "step": 1820 + }, + { + "ce_loss": 0.2942400276660919, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "distill_loss": 0.18730758130550385, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "ref_ce_loss": 0.12936684489250183, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "loss": 0.8238207101821899, + "step": 1820 + }, + { + "ce_loss": 0.26211613416671753, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "distill_loss": 0.15576207637786865, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "ref_ce_loss": 0.1929386556148529, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "loss": 1.1510733366012573, + "step": 1820 + }, + { + "ce_loss": 0.31120988726615906, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "distill_loss": 0.15629993379116058, + "epoch": 0.6070713809206137, + "step": 1820 + }, + { + "epoch": 0.6070713809206137, + "ref_ce_loss": 0.20821838080883026, + "step": 1820 + }, + { + "epoch": 0.6104069379586391, + "loss": 0.8131, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "grad_norm": 2.3518996238708496, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "learning_rate": 0.0002983180043502226, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "loss": 0.9673494100570679, + "step": 1830 + }, + { + "ce_loss": 0.2245025634765625, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "distill_loss": 0.15006595849990845, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "ref_ce_loss": 0.14450837671756744, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "loss": 1.431778907775879, + "step": 1830 + }, + { + "ce_loss": 0.3772394061088562, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "distill_loss": 0.18321493268013, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "ref_ce_loss": 0.2277681529521942, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "loss": 0.620803713798523, + "step": 1830 + }, + { + "ce_loss": 0.30052056908607483, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "distill_loss": 0.18863649666309357, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "ref_ce_loss": 0.13118411600589752, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "loss": 0.9219392538070679, + "step": 1830 + }, + { + "ce_loss": 0.3350750207901001, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "distill_loss": 0.19591687619686127, + "epoch": 0.6104069379586391, + "step": 1830 + }, + { + "epoch": 0.6104069379586391, + "ref_ce_loss": 0.18633542954921722, + "step": 1830 + }, + { + "epoch": 0.6137424949966644, + "loss": 0.8355, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "grad_norm": 2.3891096115112305, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "learning_rate": 0.0002982876196609822, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "loss": 0.719983696937561, + "step": 1840 + }, + { + "ce_loss": 0.25666162371635437, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "distill_loss": 0.12700149416923523, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "ref_ce_loss": 0.12065713107585907, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "loss": 0.9144517183303833, + "step": 1840 + }, + { + "ce_loss": 0.3790566921234131, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "distill_loss": 0.1404389590024948, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "ref_ce_loss": 0.19421830773353577, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "loss": 0.6303397417068481, + "step": 1840 + }, + { + "ce_loss": 0.35228684544563293, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "distill_loss": 0.12840238213539124, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "ref_ce_loss": 0.14961020648479462, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "loss": 0.6458494663238525, + "step": 1840 + }, + { + "ce_loss": 0.3226067125797272, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "distill_loss": 0.14511506259441376, + "epoch": 0.6137424949966644, + "step": 1840 + }, + { + "epoch": 0.6137424949966644, + "ref_ce_loss": 0.1088472381234169, + "step": 1840 + }, + { + "epoch": 0.6170780520346898, + "loss": 0.827, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "grad_norm": 2.2725181579589844, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "learning_rate": 0.00029825696455361824, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "loss": 0.9817647933959961, + "step": 1850 + }, + { + "ce_loss": 0.30564960837364197, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "distill_loss": 0.15042169392108917, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "ref_ce_loss": 0.18882113695144653, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "loss": 0.8808572292327881, + "step": 1850 + }, + { + "ce_loss": 0.4024255871772766, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "distill_loss": 0.14410771429538727, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "ref_ce_loss": 0.18836361169815063, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "loss": 0.585471510887146, + "step": 1850 + }, + { + "ce_loss": 0.30692368745803833, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "distill_loss": 0.1264827847480774, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "ref_ce_loss": 0.14431552588939667, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "loss": 0.8242551684379578, + "step": 1850 + }, + { + "ce_loss": 0.34889402985572815, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "distill_loss": 0.14129848778247833, + "epoch": 0.6170780520346898, + "step": 1850 + }, + { + "epoch": 0.6170780520346898, + "ref_ce_loss": 0.1428593099117279, + "step": 1850 + }, + { + "epoch": 0.6204136090727151, + "loss": 0.7225, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "grad_norm": 2.643566846847534, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "learning_rate": 0.0002982260390840335, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "loss": 0.7219395637512207, + "step": 1860 + }, + { + "ce_loss": 0.30792996287345886, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "distill_loss": 0.12749861180782318, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "ref_ce_loss": 0.1780974119901657, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "loss": 0.5392085909843445, + "step": 1860 + }, + { + "ce_loss": 0.21018683910369873, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "distill_loss": 0.11033669859170914, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "ref_ce_loss": 0.14142484962940216, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "loss": 1.3683431148529053, + "step": 1860 + }, + { + "ce_loss": 0.28953924775123596, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "distill_loss": 0.12735016644001007, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "ref_ce_loss": 0.16517490148544312, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "loss": 0.9593379497528076, + "step": 1860 + }, + { + "ce_loss": 0.27006015181541443, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "distill_loss": 0.10672731697559357, + "epoch": 0.6204136090727151, + "step": 1860 + }, + { + "epoch": 0.6204136090727151, + "ref_ce_loss": 0.14056552946567535, + "step": 1860 + }, + { + "epoch": 0.6237491661107405, + "loss": 0.8208, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "grad_norm": 2.525805711746216, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "learning_rate": 0.00029819484330862394, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "loss": 0.5869095921516418, + "step": 1870 + }, + { + "ce_loss": 0.27092117071151733, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "distill_loss": 0.11845798045396805, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "ref_ce_loss": 0.11767851561307907, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "loss": 0.4167972505092621, + "step": 1870 + }, + { + "ce_loss": 0.13532961905002594, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "distill_loss": 0.09994292259216309, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "ref_ce_loss": 0.1033366397023201, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "loss": 0.718509316444397, + "step": 1870 + }, + { + "ce_loss": 0.3200066089630127, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "distill_loss": 0.15193799138069153, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "ref_ce_loss": 0.15644268691539764, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "loss": 1.9445092678070068, + "step": 1870 + }, + { + "ce_loss": 0.27009880542755127, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "distill_loss": 0.11126744747161865, + "epoch": 0.6237491661107405, + "step": 1870 + }, + { + "epoch": 0.6237491661107405, + "ref_ce_loss": 0.21282550692558289, + "step": 1870 + }, + { + "epoch": 0.6270847231487658, + "loss": 0.7884, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "grad_norm": 3.6953253746032715, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "learning_rate": 0.0002981633772842782, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "loss": 0.6420892477035522, + "step": 1880 + }, + { + "ce_loss": 0.36057618260383606, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "distill_loss": 0.1327221393585205, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "ref_ce_loss": 0.14876747131347656, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "loss": 0.6036034822463989, + "step": 1880 + }, + { + "ce_loss": 0.22983922064304352, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "distill_loss": 0.11601034551858902, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "ref_ce_loss": 0.18308429419994354, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "loss": 0.4442928433418274, + "step": 1880 + }, + { + "ce_loss": 0.17181488871574402, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "distill_loss": 0.09428546577692032, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "ref_ce_loss": 0.10472335666418076, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "loss": 0.7077046632766724, + "step": 1880 + }, + { + "ce_loss": 0.2727658152580261, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "distill_loss": 0.12032405287027359, + "epoch": 0.6270847231487658, + "step": 1880 + }, + { + "epoch": 0.6270847231487658, + "ref_ce_loss": 0.19070985913276672, + "step": 1880 + }, + { + "epoch": 0.6304202801867912, + "loss": 0.7724, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "grad_norm": 3.7794764041900635, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "learning_rate": 0.00029813164106837805, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "loss": 0.8937748074531555, + "step": 1890 + }, + { + "ce_loss": 0.47305163741111755, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "distill_loss": 0.16846346855163574, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "ref_ce_loss": 0.15080559253692627, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "loss": 0.5890235304832458, + "step": 1890 + }, + { + "ce_loss": 0.23578114807605743, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "distill_loss": 0.15570217370986938, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "ref_ce_loss": 0.12087680399417877, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "loss": 0.9535101652145386, + "step": 1890 + }, + { + "ce_loss": 0.24146515130996704, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "distill_loss": 0.1533411592245102, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "ref_ce_loss": 0.11719386279582977, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "loss": 0.7761451005935669, + "step": 1890 + }, + { + "ce_loss": 0.22938808798789978, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "distill_loss": 0.12226255238056183, + "epoch": 0.6304202801867912, + "step": 1890 + }, + { + "epoch": 0.6304202801867912, + "ref_ce_loss": 0.13468782603740692, + "step": 1890 + }, + { + "epoch": 0.6337558372248165, + "loss": 0.7423, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "grad_norm": 2.9295172691345215, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "learning_rate": 0.0002980996347187977, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "loss": 0.6180705428123474, + "step": 1900 + }, + { + "ce_loss": 0.2857638895511627, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "distill_loss": 0.1499849557876587, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "ref_ce_loss": 0.13046589493751526, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "loss": 0.6940881609916687, + "step": 1900 + }, + { + "ce_loss": 0.27692747116088867, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "distill_loss": 0.1486843228340149, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "ref_ce_loss": 0.14957310259342194, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "loss": 0.5118992328643799, + "step": 1900 + }, + { + "ce_loss": 0.22834275662899017, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "distill_loss": 0.1341458261013031, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "ref_ce_loss": 0.14600332081317902, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "loss": 1.128795862197876, + "step": 1900 + }, + { + "ce_loss": 0.4237273335456848, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "distill_loss": 0.19581244885921478, + "epoch": 0.6337558372248165, + "step": 1900 + }, + { + "epoch": 0.6337558372248165, + "ref_ce_loss": 0.2021128535270691, + "step": 1900 + }, + { + "epoch": 0.6370913942628419, + "loss": 0.8588, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "grad_norm": 3.859954833984375, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "learning_rate": 0.00029806735829390415, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "loss": 1.5192310810089111, + "step": 1910 + }, + { + "ce_loss": 0.3459034264087677, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "distill_loss": 0.17199254035949707, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "ref_ce_loss": 0.229523167014122, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "loss": 1.2944340705871582, + "step": 1910 + }, + { + "ce_loss": 0.2584053575992584, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "distill_loss": 0.17071333527565002, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "ref_ce_loss": 0.1695045530796051, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "loss": 0.6828786134719849, + "step": 1910 + }, + { + "ce_loss": 0.31047752499580383, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "distill_loss": 0.1672109067440033, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "ref_ce_loss": 0.14789322018623352, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "loss": 0.5672518610954285, + "step": 1910 + }, + { + "ce_loss": 0.24659834802150726, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "distill_loss": 0.13652631640434265, + "epoch": 0.6370913942628419, + "step": 1910 + }, + { + "epoch": 0.6370913942628419, + "ref_ce_loss": 0.18395711481571198, + "step": 1910 + }, + { + "epoch": 0.6404269513008672, + "loss": 0.8946, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "grad_norm": 2.7981321811676025, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "learning_rate": 0.00029803481185255694, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "loss": 0.6429210901260376, + "step": 1920 + }, + { + "ce_loss": 0.28088638186454773, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "distill_loss": 0.16700200736522675, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "ref_ce_loss": 0.14392881095409393, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "loss": 1.208841323852539, + "step": 1920 + }, + { + "ce_loss": 0.3831193745136261, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "distill_loss": 0.1752890944480896, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "ref_ce_loss": 0.2569045424461365, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "loss": 1.2274980545043945, + "step": 1920 + }, + { + "ce_loss": 0.4168337285518646, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "distill_loss": 0.19316238164901733, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "ref_ce_loss": 0.16584856808185577, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "loss": 0.8652777671813965, + "step": 1920 + }, + { + "ce_loss": 0.36839842796325684, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "distill_loss": 0.1646052449941635, + "epoch": 0.6404269513008672, + "step": 1920 + }, + { + "epoch": 0.6404269513008672, + "ref_ce_loss": 0.2286727875471115, + "step": 1920 + }, + { + "epoch": 0.6437625083388926, + "loss": 0.8195, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "grad_norm": 3.6174919605255127, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "learning_rate": 0.00029800199545410787, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "loss": 0.5298856496810913, + "step": 1930 + }, + { + "ce_loss": 0.2486499845981598, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "distill_loss": 0.16173726320266724, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "ref_ce_loss": 0.1191975474357605, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "loss": 0.5920172333717346, + "step": 1930 + }, + { + "ce_loss": 0.17179836332798004, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "distill_loss": 0.213221475481987, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "ref_ce_loss": 0.14253775775432587, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "loss": 0.9216732978820801, + "step": 1930 + }, + { + "ce_loss": 0.2662244737148285, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "distill_loss": 0.20773352682590485, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "ref_ce_loss": 0.1405613273382187, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "loss": 0.5762615203857422, + "step": 1930 + }, + { + "ce_loss": 0.21583402156829834, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "distill_loss": 0.15604069828987122, + "epoch": 0.6437625083388926, + "step": 1930 + }, + { + "epoch": 0.6437625083388926, + "ref_ce_loss": 0.1498086303472519, + "step": 1930 + }, + { + "epoch": 0.6470980653769179, + "loss": 0.8172, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "grad_norm": 4.659943580627441, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "learning_rate": 0.0002979689091584011, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "loss": 0.5301949977874756, + "step": 1940 + }, + { + "ce_loss": 0.1708390712738037, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "distill_loss": 0.12711238861083984, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "ref_ce_loss": 0.13796593248844147, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "loss": 1.0410027503967285, + "step": 1940 + }, + { + "ce_loss": 0.37468576431274414, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "distill_loss": 0.17693452537059784, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "ref_ce_loss": 0.14826247096061707, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "loss": 0.7805930376052856, + "step": 1940 + }, + { + "ce_loss": 0.29428359866142273, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "distill_loss": 0.15377718210220337, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "ref_ce_loss": 0.18468515574932098, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "loss": 0.5379800200462341, + "step": 1940 + }, + { + "ce_loss": 0.25719043612480164, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "distill_loss": 0.14073030650615692, + "epoch": 0.6470980653769179, + "step": 1940 + }, + { + "epoch": 0.6470980653769179, + "ref_ce_loss": 0.13994979858398438, + "step": 1940 + }, + { + "epoch": 0.6504336224149433, + "loss": 0.7606, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "grad_norm": 2.321260929107666, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "learning_rate": 0.000297935553025773, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "loss": 0.5624473094940186, + "step": 1950 + }, + { + "ce_loss": 0.2591063976287842, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "distill_loss": 0.13360954821109772, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "ref_ce_loss": 0.16947798430919647, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "loss": 0.6944127082824707, + "step": 1950 + }, + { + "ce_loss": 0.3012009263038635, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "distill_loss": 0.16501295566558838, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "ref_ce_loss": 0.1470092236995697, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "loss": 0.7297142744064331, + "step": 1950 + }, + { + "ce_loss": 0.3316505253314972, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "distill_loss": 0.1685563027858734, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "ref_ce_loss": 0.14613375067710876, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "loss": 0.5265622735023499, + "step": 1950 + }, + { + "ce_loss": 0.1956728845834732, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "distill_loss": 0.15365520119667053, + "epoch": 0.6504336224149433, + "step": 1950 + }, + { + "epoch": 0.6504336224149433, + "ref_ce_loss": 0.17705023288726807, + "step": 1950 + }, + { + "epoch": 0.6537691794529686, + "loss": 0.8083, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "grad_norm": 2.691359758377075, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "learning_rate": 0.00029790192711705196, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "loss": 0.8530191779136658, + "step": 1960 + }, + { + "ce_loss": 0.3192751109600067, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "distill_loss": 0.18763959407806396, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "ref_ce_loss": 0.13116642832756042, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "loss": 0.8203089237213135, + "step": 1960 + }, + { + "ce_loss": 0.3802758455276489, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "distill_loss": 0.16498608887195587, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "ref_ce_loss": 0.2160835862159729, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "loss": 0.995650053024292, + "step": 1960 + }, + { + "ce_loss": 0.3241329789161682, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "distill_loss": 0.14752744138240814, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "ref_ce_loss": 0.17125451564788818, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "loss": 0.631495475769043, + "step": 1960 + }, + { + "ce_loss": 0.15118324756622314, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "distill_loss": 0.12421630322933197, + "epoch": 0.6537691794529686, + "step": 1960 + }, + { + "epoch": 0.6537691794529686, + "ref_ce_loss": 0.11524390429258347, + "step": 1960 + }, + { + "epoch": 0.657104736490994, + "loss": 0.7711, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "grad_norm": 2.697709798812866, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "learning_rate": 0.00029786803149355843, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "loss": 0.44634926319122314, + "step": 1970 + }, + { + "ce_loss": 0.18240369856357574, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "distill_loss": 0.15074263513088226, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "ref_ce_loss": 0.1105746328830719, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "loss": 0.5571528077125549, + "step": 1970 + }, + { + "ce_loss": 0.2294694185256958, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "distill_loss": 0.1737332046031952, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "ref_ce_loss": 0.14419282972812653, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "loss": 1.3893582820892334, + "step": 1970 + }, + { + "ce_loss": 0.3939763307571411, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "distill_loss": 0.1607140153646469, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "ref_ce_loss": 0.17446056008338928, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "loss": 0.8046218156814575, + "step": 1970 + }, + { + "ce_loss": 0.28628915548324585, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "distill_loss": 0.16188013553619385, + "epoch": 0.657104736490994, + "step": 1970 + }, + { + "epoch": 0.657104736490994, + "ref_ce_loss": 0.16178050637245178, + "step": 1970 + }, + { + "epoch": 0.6604402935290193, + "loss": 0.7564, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "grad_norm": 2.1782381534576416, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "learning_rate": 0.00029783386621710467, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "loss": 0.6052896976470947, + "step": 1980 + }, + { + "ce_loss": 0.21867820620536804, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "distill_loss": 0.12602409720420837, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "ref_ce_loss": 0.1373637467622757, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "loss": 1.0979907512664795, + "step": 1980 + }, + { + "ce_loss": 0.4194616973400116, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "distill_loss": 0.19360217452049255, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "ref_ce_loss": 0.21660290658473969, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "loss": 0.9994645118713379, + "step": 1980 + }, + { + "ce_loss": 0.26734447479248047, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "distill_loss": 0.14631450176239014, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "ref_ce_loss": 0.19095124304294586, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "loss": 1.1816197633743286, + "step": 1980 + }, + { + "ce_loss": 0.32602429389953613, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "distill_loss": 0.17026512324810028, + "epoch": 0.6604402935290193, + "step": 1980 + }, + { + "epoch": 0.6604402935290193, + "ref_ce_loss": 0.16700991988182068, + "step": 1980 + }, + { + "epoch": 0.6637758505670447, + "loss": 0.7854, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "grad_norm": 2.9347715377807617, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "learning_rate": 0.0002977994313499946, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "loss": 0.9755704402923584, + "step": 1990 + }, + { + "ce_loss": 0.26383841037750244, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "distill_loss": 0.10116223245859146, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "ref_ce_loss": 0.15708518028259277, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "loss": 0.5551912784576416, + "step": 1990 + }, + { + "ce_loss": 0.21296478807926178, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "distill_loss": 0.12776599824428558, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "ref_ce_loss": 0.10435923933982849, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "loss": 0.5524492859840393, + "step": 1990 + }, + { + "ce_loss": 0.26941555738449097, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "distill_loss": 0.11517900228500366, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "ref_ce_loss": 0.16757942736148834, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "loss": 0.5536325573921204, + "step": 1990 + }, + { + "ce_loss": 0.30605706572532654, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "distill_loss": 0.1278000921010971, + "epoch": 0.6637758505670447, + "step": 1990 + }, + { + "epoch": 0.6637758505670447, + "ref_ce_loss": 0.11955641955137253, + "step": 1990 + }, + { + "epoch": 0.66711140760507, + "loss": 0.9327, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "grad_norm": 7.14279317855835, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "learning_rate": 0.00029776472695502385, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "loss": 1.3240374326705933, + "step": 2000 + }, + { + "ce_loss": 0.14523351192474365, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "distill_loss": 0.945601224899292, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "ref_ce_loss": 0.13358724117279053, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "loss": 1.9700191020965576, + "step": 2000 + }, + { + "ce_loss": 0.37845736742019653, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "distill_loss": 1.0867323875427246, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "ref_ce_loss": 0.23238378763198853, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "loss": 1.733937382698059, + "step": 2000 + }, + { + "ce_loss": 0.30824780464172363, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "distill_loss": 0.8462389707565308, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "ref_ce_loss": 0.13755394518375397, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "loss": 1.9826970100402832, + "step": 2000 + }, + { + "ce_loss": 0.3606218099594116, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "distill_loss": 1.0254515409469604, + "epoch": 0.66711140760507, + "step": 2000 + }, + { + "epoch": 0.66711140760507, + "ref_ce_loss": 0.16944263875484467, + "step": 2000 + }, + { + "epoch": 0.6704469646430954, + "loss": 1.0835, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "grad_norm": 1.8303823471069336, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "learning_rate": 0.0002977297530954796, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "loss": 1.0600343942642212, + "step": 2010 + }, + { + "ce_loss": 0.22668348252773285, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "distill_loss": 0.2836839556694031, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "ref_ce_loss": 0.1294415295124054, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "loss": 0.828821063041687, + "step": 2010 + }, + { + "ce_loss": 0.24913601577281952, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "distill_loss": 0.24501857161521912, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "ref_ce_loss": 0.17393875122070312, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "loss": 1.1273013353347778, + "step": 2010 + }, + { + "ce_loss": 0.2586398124694824, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "distill_loss": 0.2831948399543762, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "ref_ce_loss": 0.18231160938739777, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "loss": 0.5938960909843445, + "step": 2010 + }, + { + "ce_loss": 0.2319275587797165, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "distill_loss": 0.22680506110191345, + "epoch": 0.6704469646430954, + "step": 2010 + }, + { + "epoch": 0.6704469646430954, + "ref_ce_loss": 0.1350928097963333, + "step": 2010 + }, + { + "epoch": 0.6737825216811207, + "loss": 0.9008, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "grad_norm": 2.8291239738464355, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "learning_rate": 0.0002976945098351403, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "loss": 0.590222954750061, + "step": 2020 + }, + { + "ce_loss": 0.27437344193458557, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "distill_loss": 0.17268088459968567, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "ref_ce_loss": 0.14304281771183014, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "loss": 0.8302665948867798, + "step": 2020 + }, + { + "ce_loss": 0.2411762773990631, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "distill_loss": 0.1841261237859726, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "ref_ce_loss": 0.1724471151828766, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "loss": 0.6391019821166992, + "step": 2020 + }, + { + "ce_loss": 0.19330915808677673, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "distill_loss": 0.16628727316856384, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "ref_ce_loss": 0.14351928234100342, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "loss": 0.9922801852226257, + "step": 2020 + }, + { + "ce_loss": 0.18819864094257355, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "distill_loss": 0.13531914353370667, + "epoch": 0.6737825216811207, + "step": 2020 + }, + { + "epoch": 0.6737825216811207, + "ref_ce_loss": 0.1615561842918396, + "step": 2020 + }, + { + "epoch": 0.6771180787191461, + "loss": 0.84, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "grad_norm": 3.3795111179351807, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "learning_rate": 0.00029765899723827575, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "loss": 0.8877922892570496, + "step": 2030 + }, + { + "ce_loss": 0.2900596261024475, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "distill_loss": 0.24294210970401764, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "ref_ce_loss": 0.17468880116939545, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "loss": 0.8432213068008423, + "step": 2030 + }, + { + "ce_loss": 0.3750610053539276, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "distill_loss": 0.22893035411834717, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "ref_ce_loss": 0.1852346807718277, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "loss": 0.6145089864730835, + "step": 2030 + }, + { + "ce_loss": 0.25153613090515137, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "distill_loss": 0.18749454617500305, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "ref_ce_loss": 0.1751817911863327, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "loss": 1.0318095684051514, + "step": 2030 + }, + { + "ce_loss": 0.17771191895008087, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "distill_loss": 0.17989817261695862, + "epoch": 0.6771180787191461, + "step": 2030 + }, + { + "epoch": 0.6771180787191461, + "ref_ce_loss": 0.12897759675979614, + "step": 2030 + }, + { + "epoch": 0.6804536357571714, + "loss": 0.7896, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "grad_norm": 3.249816656112671, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "learning_rate": 0.00029762321536964704, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "loss": 0.9905202388763428, + "step": 2040 + }, + { + "ce_loss": 0.37102267146110535, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "distill_loss": 0.21594317257404327, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "ref_ce_loss": 0.2302805334329605, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "loss": 0.8301817178726196, + "step": 2040 + }, + { + "ce_loss": 0.22139473259449005, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "distill_loss": 0.17883756756782532, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "ref_ce_loss": 0.15354622900485992, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "loss": 0.5395179390907288, + "step": 2040 + }, + { + "ce_loss": 0.2128942608833313, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "distill_loss": 0.1688617467880249, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "ref_ce_loss": 0.10727988183498383, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "loss": 0.8493964672088623, + "step": 2040 + }, + { + "ce_loss": 0.29065755009651184, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "distill_loss": 0.21070683002471924, + "epoch": 0.6804536357571714, + "step": 2040 + }, + { + "epoch": 0.6804536357571714, + "ref_ce_loss": 0.17182712256908417, + "step": 2040 + }, + { + "epoch": 0.6837891927951968, + "loss": 0.8114, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "grad_norm": 2.234649658203125, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "learning_rate": 0.0002975871642945061, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "loss": 1.5577614307403564, + "step": 2050 + }, + { + "ce_loss": 0.32842814922332764, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "distill_loss": 0.1840927004814148, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "ref_ce_loss": 0.1488945633172989, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "loss": 0.5667079091072083, + "step": 2050 + }, + { + "ce_loss": 0.1807955801486969, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "distill_loss": 0.16267137229442596, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "ref_ce_loss": 0.10108146071434021, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "loss": 0.35961073637008667, + "step": 2050 + }, + { + "ce_loss": 0.1420925408601761, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "distill_loss": 0.13304996490478516, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "ref_ce_loss": 0.08395658433437347, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "loss": 1.5085173845291138, + "step": 2050 + }, + { + "ce_loss": 0.2386820912361145, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "distill_loss": 0.1670990139245987, + "epoch": 0.6837891927951968, + "step": 2050 + }, + { + "epoch": 0.6837891927951968, + "ref_ce_loss": 0.16288962960243225, + "step": 2050 + }, + { + "epoch": 0.6871247498332221, + "loss": 0.8301, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "grad_norm": 4.133786201477051, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "learning_rate": 0.0002975508440785958, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "loss": 0.7842780351638794, + "step": 2060 + }, + { + "ce_loss": 0.36025092005729675, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "distill_loss": 0.1954689919948578, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "ref_ce_loss": 0.22843322157859802, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "loss": 0.7526856064796448, + "step": 2060 + }, + { + "ce_loss": 0.24765729904174805, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "distill_loss": 0.20760603249073029, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "ref_ce_loss": 0.19382600486278534, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "loss": 0.9192790985107422, + "step": 2060 + }, + { + "ce_loss": 0.2992182970046997, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "distill_loss": 0.20960591733455658, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "ref_ce_loss": 0.16578657925128937, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "loss": 0.7168542742729187, + "step": 2060 + }, + { + "ce_loss": 0.2953929603099823, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "distill_loss": 0.2222084403038025, + "epoch": 0.6871247498332221, + "step": 2060 + }, + { + "epoch": 0.6871247498332221, + "ref_ce_loss": 0.1362280398607254, + "step": 2060 + }, + { + "epoch": 0.6904603068712475, + "loss": 0.8254, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "grad_norm": 1.7999627590179443, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "learning_rate": 0.0002975142547881501, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "loss": 0.8302853107452393, + "step": 2070 + }, + { + "ce_loss": 0.3683020770549774, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "distill_loss": 0.14901624619960785, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "ref_ce_loss": 0.21112219989299774, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "loss": 0.8270956873893738, + "step": 2070 + }, + { + "ce_loss": 0.3249755799770355, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "distill_loss": 0.14936313033103943, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "ref_ce_loss": 0.17302238941192627, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "loss": 1.6876327991485596, + "step": 2070 + }, + { + "ce_loss": 0.30081063508987427, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "distill_loss": 0.16745862364768982, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "ref_ce_loss": 0.12626586854457855, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "loss": 0.8751262426376343, + "step": 2070 + }, + { + "ce_loss": 0.31323501467704773, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "distill_loss": 0.16205358505249023, + "epoch": 0.6904603068712475, + "step": 2070 + }, + { + "epoch": 0.6904603068712475, + "ref_ce_loss": 0.2028389722108841, + "step": 2070 + }, + { + "epoch": 0.6937958639092728, + "loss": 0.8444, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "grad_norm": 2.48443341255188, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "learning_rate": 0.00029747739648989315, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "loss": 0.7376833558082581, + "step": 2080 + }, + { + "ce_loss": 0.28451669216156006, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "distill_loss": 0.20127324759960175, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "ref_ce_loss": 0.1741037368774414, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "loss": 0.4693785607814789, + "step": 2080 + }, + { + "ce_loss": 0.16751381754875183, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "distill_loss": 0.15593096613883972, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "ref_ce_loss": 0.14585992693901062, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "loss": 1.0831847190856934, + "step": 2080 + }, + { + "ce_loss": 0.31920549273490906, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "distill_loss": 0.23243792355060577, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "ref_ce_loss": 0.1392020583152771, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "loss": 0.6108224391937256, + "step": 2080 + }, + { + "ce_loss": 0.19906829297542572, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "distill_loss": 0.21175572276115417, + "epoch": 0.6937958639092728, + "step": 2080 + }, + { + "epoch": 0.6937958639092728, + "ref_ce_loss": 0.12633490562438965, + "step": 2080 + }, + { + "epoch": 0.6971314209472982, + "loss": 1.1597, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "grad_norm": 3.2459542751312256, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "learning_rate": 0.00029744026925104014, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "loss": 0.8244901895523071, + "step": 2090 + }, + { + "ce_loss": 0.18612109124660492, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "distill_loss": 0.4257374703884125, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "ref_ce_loss": 0.12089783698320389, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "loss": 1.226898193359375, + "step": 2090 + }, + { + "ce_loss": 0.3660268485546112, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "distill_loss": 0.5295765995979309, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "ref_ce_loss": 0.16705910861492157, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "loss": 1.0499669313430786, + "step": 2090 + }, + { + "ce_loss": 0.26359203457832336, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "distill_loss": 0.5406015515327454, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "ref_ce_loss": 0.16747480630874634, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "loss": 0.9574893712997437, + "step": 2090 + }, + { + "ce_loss": 0.27059000730514526, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "distill_loss": 0.4332764446735382, + "epoch": 0.6971314209472982, + "step": 2090 + }, + { + "epoch": 0.6971314209472982, + "ref_ce_loss": 0.17012599110603333, + "step": 2090 + }, + { + "epoch": 0.7004669779853235, + "loss": 1.1214, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "grad_norm": 3.7141575813293457, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "learning_rate": 0.00029740287313929643, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "loss": 1.2868798971176147, + "step": 2100 + }, + { + "ce_loss": 0.24377526342868805, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "distill_loss": 0.5068783760070801, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "ref_ce_loss": 0.122403085231781, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "loss": 0.9823429584503174, + "step": 2100 + }, + { + "ce_loss": 0.27241188287734985, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "distill_loss": 0.39842548966407776, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "ref_ce_loss": 0.18590964376926422, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "loss": 1.0657988786697388, + "step": 2100 + }, + { + "ce_loss": 0.21215464174747467, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "distill_loss": 0.368913471698761, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "ref_ce_loss": 0.23388366401195526, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "loss": 0.9568293690681458, + "step": 2100 + }, + { + "ce_loss": 0.3097842335700989, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "distill_loss": 0.37886306643486023, + "epoch": 0.7004669779853235, + "step": 2100 + }, + { + "epoch": 0.7004669779853235, + "ref_ce_loss": 0.1720070242881775, + "step": 2100 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.0944, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "grad_norm": 8.202859878540039, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "learning_rate": 0.0002973652082228578, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.5614635944366455, + "step": 2110 + }, + { + "ce_loss": 0.4321347177028656, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "distill_loss": 0.5474117994308472, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "ref_ce_loss": 0.17317543923854828, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.1091805696487427, + "step": 2110 + }, + { + "ce_loss": 0.19208572804927826, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "distill_loss": 0.3152616620063782, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "ref_ce_loss": 0.19635343551635742, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.0785329341888428, + "step": 2110 + }, + { + "ce_loss": 0.3584291338920593, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "distill_loss": 0.41182008385658264, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "ref_ce_loss": 0.14687897264957428, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "loss": 1.312581181526184, + "step": 2110 + }, + { + "ce_loss": 0.3514189124107361, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "distill_loss": 0.495456337928772, + "epoch": 0.7038025350233489, + "step": 2110 + }, + { + "epoch": 0.7038025350233489, + "ref_ce_loss": 0.14532673358917236, + "step": 2110 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.0739, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "grad_norm": 3.721625566482544, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "learning_rate": 0.00029732727457041025, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.1425330638885498, + "step": 2120 + }, + { + "ce_loss": 0.362537145614624, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "distill_loss": 0.3849060535430908, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "ref_ce_loss": 0.17424069344997406, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.050082802772522, + "step": 2120 + }, + { + "ce_loss": 0.30090203881263733, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "distill_loss": 0.42408767342567444, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "ref_ce_loss": 0.10593853145837784, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.2173357009887695, + "step": 2120 + }, + { + "ce_loss": 0.2668623924255371, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "distill_loss": 0.39260539412498474, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "ref_ce_loss": 0.17890018224716187, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "loss": 1.525931715965271, + "step": 2120 + }, + { + "ce_loss": 0.35097646713256836, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "distill_loss": 0.341688334941864, + "epoch": 0.7071380920613742, + "step": 2120 + }, + { + "epoch": 0.7071380920613742, + "ref_ce_loss": 0.2002553790807724, + "step": 2120 + }, + { + "epoch": 0.7104736490993996, + "loss": 0.8325, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "grad_norm": 5.317857265472412, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "learning_rate": 0.0002972890722511297, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "loss": 0.9740656614303589, + "step": 2130 + }, + { + "ce_loss": 0.2215503603219986, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "distill_loss": 0.18275123834609985, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "ref_ce_loss": 0.18407899141311646, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "loss": 0.9693307876586914, + "step": 2130 + }, + { + "ce_loss": 0.33199867606163025, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "distill_loss": 0.2375098168849945, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "ref_ce_loss": 0.17215308547019958, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "loss": 1.586306095123291, + "step": 2130 + }, + { + "ce_loss": 0.32106271386146545, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "distill_loss": 0.1949656754732132, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "ref_ce_loss": 0.11499127000570297, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "loss": 0.836284875869751, + "step": 2130 + }, + { + "ce_loss": 0.3160724639892578, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "distill_loss": 0.1925450563430786, + "epoch": 0.7104736490993996, + "step": 2130 + }, + { + "epoch": 0.7104736490993996, + "ref_ce_loss": 0.24203264713287354, + "step": 2130 + }, + { + "epoch": 0.7138092061374249, + "loss": 0.7741, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "grad_norm": 2.827847719192505, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "learning_rate": 0.0002972506013346822, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "loss": 0.853046715259552, + "step": 2140 + }, + { + "ce_loss": 0.3299194574356079, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "distill_loss": 0.17669892311096191, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "ref_ce_loss": 0.13139741122722626, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "loss": 0.6886839270591736, + "step": 2140 + }, + { + "ce_loss": 0.3075639009475708, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "distill_loss": 0.18278264999389648, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "ref_ce_loss": 0.19807660579681396, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "loss": 0.6770578622817993, + "step": 2140 + }, + { + "ce_loss": 0.2196367233991623, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "distill_loss": 0.1661662459373474, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "ref_ce_loss": 0.12581881880760193, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "loss": 1.0704749822616577, + "step": 2140 + }, + { + "ce_loss": 0.3837515711784363, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "distill_loss": 0.18932490050792694, + "epoch": 0.7138092061374249, + "step": 2140 + }, + { + "epoch": 0.7138092061374249, + "ref_ce_loss": 0.19901344180107117, + "step": 2140 + }, + { + "epoch": 0.7171447631754503, + "loss": 0.7765, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "grad_norm": 2.2405152320861816, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "learning_rate": 0.00029721186189122346, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "loss": 0.521872878074646, + "step": 2150 + }, + { + "ce_loss": 0.20972879230976105, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "distill_loss": 0.1326482594013214, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "ref_ce_loss": 0.1792561411857605, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "loss": 0.7119396328926086, + "step": 2150 + }, + { + "ce_loss": 0.31725606322288513, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "distill_loss": 0.13686680793762207, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "ref_ce_loss": 0.18049372732639313, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "loss": 0.5622824430465698, + "step": 2150 + }, + { + "ce_loss": 0.207227885723114, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "distill_loss": 0.1053415834903717, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "ref_ce_loss": 0.17381590604782104, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "loss": 0.8207659721374512, + "step": 2150 + }, + { + "ce_loss": 0.24983304738998413, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "distill_loss": 0.1186215877532959, + "epoch": 0.7171447631754503, + "step": 2150 + }, + { + "epoch": 0.7171447631754503, + "ref_ce_loss": 0.20335406064987183, + "step": 2150 + }, + { + "epoch": 0.7204803202134756, + "loss": 0.739, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "grad_norm": 4.988354682922363, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "learning_rate": 0.000297172853991399, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "loss": 0.9101301431655884, + "step": 2160 + }, + { + "ce_loss": 0.23088371753692627, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "distill_loss": 0.14969925582408905, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "ref_ce_loss": 0.247276172041893, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "loss": 0.5818696022033691, + "step": 2160 + }, + { + "ce_loss": 0.21994711458683014, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "distill_loss": 0.19431249797344208, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "ref_ce_loss": 0.12076810002326965, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "loss": 0.6261471509933472, + "step": 2160 + }, + { + "ce_loss": 0.2857092618942261, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "distill_loss": 0.18585669994354248, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "ref_ce_loss": 0.1544153392314911, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "loss": 0.7494639754295349, + "step": 2160 + }, + { + "ce_loss": 0.26624763011932373, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "distill_loss": 0.15255224704742432, + "epoch": 0.7204803202134756, + "step": 2160 + }, + { + "epoch": 0.7204803202134756, + "ref_ce_loss": 0.13864634931087494, + "step": 2160 + }, + { + "epoch": 0.723815877251501, + "loss": 0.8542, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "grad_norm": 2.2944247722625732, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "learning_rate": 0.0002971335777063438, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "loss": 0.7885057926177979, + "step": 2170 + }, + { + "ce_loss": 0.2262822985649109, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "distill_loss": 0.22849422693252563, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "ref_ce_loss": 0.24023662507534027, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "loss": 1.0535857677459717, + "step": 2170 + }, + { + "ce_loss": 0.33788785338401794, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "distill_loss": 0.28586989641189575, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "ref_ce_loss": 0.1780318319797516, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "loss": 0.929999828338623, + "step": 2170 + }, + { + "ce_loss": 0.3797388970851898, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "distill_loss": 0.2834654748439789, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "ref_ce_loss": 0.1784849613904953, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "loss": 0.8898862600326538, + "step": 2170 + }, + { + "ce_loss": 0.3450695872306824, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "distill_loss": 0.3220275640487671, + "epoch": 0.723815877251501, + "step": 2170 + }, + { + "epoch": 0.723815877251501, + "ref_ce_loss": 0.16206231713294983, + "step": 2170 + }, + { + "epoch": 0.7271514342895263, + "loss": 0.9429, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "grad_norm": 2.309176445007324, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "learning_rate": 0.0002970940331076823, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "loss": 0.8022871017456055, + "step": 2180 + }, + { + "ce_loss": 0.3425973355770111, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "distill_loss": 0.2834654152393341, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "ref_ce_loss": 0.17585596442222595, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "loss": 0.9950260519981384, + "step": 2180 + }, + { + "ce_loss": 0.276937872171402, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "distill_loss": 0.3323085606098175, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "ref_ce_loss": 0.15872758626937866, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "loss": 1.056720495223999, + "step": 2180 + }, + { + "ce_loss": 0.25281429290771484, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "distill_loss": 0.3386659622192383, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "ref_ce_loss": 0.16279882192611694, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "loss": 0.8699762225151062, + "step": 2180 + }, + { + "ce_loss": 0.34646478295326233, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "distill_loss": 0.2966349720954895, + "epoch": 0.7271514342895263, + "step": 2180 + }, + { + "epoch": 0.7271514342895263, + "ref_ce_loss": 0.18022169172763824, + "step": 2180 + }, + { + "epoch": 0.7304869913275517, + "loss": 0.9007, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "grad_norm": 2.0588133335113525, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "learning_rate": 0.00029705422026752833, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "loss": 0.6151515245437622, + "step": 2190 + }, + { + "ce_loss": 0.23072300851345062, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "distill_loss": 0.16477593779563904, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "ref_ce_loss": 0.12058866024017334, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "loss": 0.6579670310020447, + "step": 2190 + }, + { + "ce_loss": 0.25194308161735535, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "distill_loss": 0.20696522295475006, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "ref_ce_loss": 0.11303284764289856, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "loss": 0.7196918725967407, + "step": 2190 + }, + { + "ce_loss": 0.27340036630630493, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "distill_loss": 0.22912757098674774, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "ref_ce_loss": 0.12707854807376862, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "loss": 0.6519283056259155, + "step": 2190 + }, + { + "ce_loss": 0.19143091142177582, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "distill_loss": 0.20538610219955444, + "epoch": 0.7304869913275517, + "step": 2190 + }, + { + "epoch": 0.7304869913275517, + "ref_ce_loss": 0.2145569622516632, + "step": 2190 + }, + { + "epoch": 0.733822548365577, + "loss": 0.813, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "grad_norm": 2.7598912715911865, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "learning_rate": 0.0002970141392584847, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "loss": 1.048766016960144, + "step": 2200 + }, + { + "ce_loss": 0.18640631437301636, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "distill_loss": 0.37723419070243835, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "ref_ce_loss": 0.16142550110816956, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "loss": 1.8987281322479248, + "step": 2200 + }, + { + "ce_loss": 0.4953133761882782, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "distill_loss": 0.496428906917572, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "ref_ce_loss": 0.294897198677063, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "loss": 1.2234065532684326, + "step": 2200 + }, + { + "ce_loss": 0.44955363869667053, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "distill_loss": 0.4747593402862549, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "ref_ce_loss": 0.2343982309103012, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "loss": 1.452303171157837, + "step": 2200 + }, + { + "ce_loss": 0.26678091287612915, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "distill_loss": 0.4596899449825287, + "epoch": 0.733822548365577, + "step": 2200 + }, + { + "epoch": 0.733822548365577, + "ref_ce_loss": 0.16570772230625153, + "step": 2200 + }, + { + "epoch": 0.7371581054036024, + "loss": 1.0477, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "grad_norm": 3.025998830795288, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "learning_rate": 0.00029697379015364343, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "loss": 0.9370204210281372, + "step": 2210 + }, + { + "ce_loss": 0.30269691348075867, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "distill_loss": 0.22978685796260834, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "ref_ce_loss": 0.2049352377653122, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "loss": 0.7113454341888428, + "step": 2210 + }, + { + "ce_loss": 0.1772550493478775, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "distill_loss": 0.2620844841003418, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "ref_ce_loss": 0.09602729231119156, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "loss": 1.1970373392105103, + "step": 2210 + }, + { + "ce_loss": 0.3331030607223511, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "distill_loss": 0.2646982967853546, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "ref_ce_loss": 0.10817735642194748, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "loss": 0.6757293343544006, + "step": 2210 + }, + { + "ce_loss": 0.2500683665275574, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "distill_loss": 0.2608494460582733, + "epoch": 0.7371581054036024, + "step": 2210 + }, + { + "epoch": 0.7371581054036024, + "ref_ce_loss": 0.16467423737049103, + "step": 2210 + }, + { + "epoch": 0.7404936624416277, + "loss": 0.7845, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "grad_norm": 2.9061050415039062, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "learning_rate": 0.00029693317302658534, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "loss": 0.7362913489341736, + "step": 2220 + }, + { + "ce_loss": 0.24955718219280243, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "distill_loss": 0.1837398260831833, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "ref_ce_loss": 0.1013365387916565, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "loss": 0.8126053810119629, + "step": 2220 + }, + { + "ce_loss": 0.31580850481987, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "distill_loss": 0.20748618245124817, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "ref_ce_loss": 0.1511477679014206, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "loss": 0.5393919944763184, + "step": 2220 + }, + { + "ce_loss": 0.18664191663265228, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "distill_loss": 0.1758815348148346, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "ref_ce_loss": 0.12390191853046417, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "loss": 0.7449082732200623, + "step": 2220 + }, + { + "ce_loss": 0.2927023470401764, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "distill_loss": 0.2008521407842636, + "epoch": 0.7404936624416277, + "step": 2220 + }, + { + "epoch": 0.7404936624416277, + "ref_ce_loss": 0.1507752686738968, + "step": 2220 + }, + { + "epoch": 0.7438292194796531, + "loss": 0.8107, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "grad_norm": 2.3651397228240967, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "learning_rate": 0.00029689228795138, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "loss": 0.6003869771957397, + "step": 2230 + }, + { + "ce_loss": 0.22887085378170013, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "distill_loss": 0.17210716009140015, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "ref_ce_loss": 0.14253908395767212, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "loss": 0.662057101726532, + "step": 2230 + }, + { + "ce_loss": 0.19098782539367676, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "distill_loss": 0.19936403632164001, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "ref_ce_loss": 0.08553535491228104, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "loss": 0.5438237190246582, + "step": 2230 + }, + { + "ce_loss": 0.1906469613313675, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "distill_loss": 0.1763281524181366, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "ref_ce_loss": 0.12854325771331787, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "loss": 0.7014032006263733, + "step": 2230 + }, + { + "ce_loss": 0.2834837734699249, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "distill_loss": 0.21160462498664856, + "epoch": 0.7438292194796531, + "step": 2230 + }, + { + "epoch": 0.7438292194796531, + "ref_ce_loss": 0.11291535943746567, + "step": 2230 + }, + { + "epoch": 0.7471647765176784, + "loss": 0.8327, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "grad_norm": 4.035131454467773, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "learning_rate": 0.0002968511350025858, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "loss": 0.791498064994812, + "step": 2240 + }, + { + "ce_loss": 0.24079382419586182, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "distill_loss": 0.31047412753105164, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "ref_ce_loss": 0.15600360929965973, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "loss": 0.8515423536300659, + "step": 2240 + }, + { + "ce_loss": 0.24841324985027313, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "distill_loss": 0.28120777010917664, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "ref_ce_loss": 0.25041747093200684, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "loss": 0.8282434940338135, + "step": 2240 + }, + { + "ce_loss": 0.32026177644729614, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "distill_loss": 0.3410600423812866, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "ref_ce_loss": 0.12167643010616302, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "loss": 0.8194974064826965, + "step": 2240 + }, + { + "ce_loss": 0.2856970429420471, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "distill_loss": 0.3313988149166107, + "epoch": 0.7471647765176784, + "step": 2240 + }, + { + "epoch": 0.7471647765176784, + "ref_ce_loss": 0.20230844616889954, + "step": 2240 + }, + { + "epoch": 0.7505003335557038, + "loss": 0.8899, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "grad_norm": 2.862393617630005, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "learning_rate": 0.0002968097142552494, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "loss": 0.7680596113204956, + "step": 2250 + }, + { + "ce_loss": 0.29027098417282104, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "distill_loss": 0.15521329641342163, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "ref_ce_loss": 0.23092901706695557, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "loss": 0.5187767148017883, + "step": 2250 + }, + { + "ce_loss": 0.20998996496200562, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "distill_loss": 0.1527378261089325, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "ref_ce_loss": 0.1060512512922287, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "loss": 1.1777644157409668, + "step": 2250 + }, + { + "ce_loss": 0.25235748291015625, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "distill_loss": 0.17757558822631836, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "ref_ce_loss": 0.21147017180919647, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "loss": 0.701378345489502, + "step": 2250 + }, + { + "ce_loss": 0.32881441712379456, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "distill_loss": 0.1808972954750061, + "epoch": 0.7505003335557038, + "step": 2250 + }, + { + "epoch": 0.7505003335557038, + "ref_ce_loss": 0.12013345956802368, + "step": 2250 + }, + { + "epoch": 0.7538358905937291, + "loss": 0.8455, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "grad_norm": 1.874451994895935, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "learning_rate": 0.0002967680257849059, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "loss": 0.6945121884346008, + "step": 2260 + }, + { + "ce_loss": 0.33384138345718384, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "distill_loss": 0.17329035699367523, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "ref_ce_loss": 0.18721669912338257, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "loss": 0.754318118095398, + "step": 2260 + }, + { + "ce_loss": 0.2305452823638916, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "distill_loss": 0.14365842938423157, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "ref_ce_loss": 0.12453807890415192, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "loss": 1.0577518939971924, + "step": 2260 + }, + { + "ce_loss": 0.22482116520404816, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "distill_loss": 0.15167245268821716, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "ref_ce_loss": 0.1493624448776245, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "loss": 1.0203830003738403, + "step": 2260 + }, + { + "ce_loss": 0.28023025393486023, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "distill_loss": 0.17490828037261963, + "epoch": 0.7538358905937291, + "step": 2260 + }, + { + "epoch": 0.7538358905937291, + "ref_ce_loss": 0.21855275332927704, + "step": 2260 + }, + { + "epoch": 0.7571714476317545, + "loss": 0.7753, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "grad_norm": 3.631338357925415, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "learning_rate": 0.00029672606966757854, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "loss": 0.7592071890830994, + "step": 2270 + }, + { + "ce_loss": 0.3136902153491974, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "distill_loss": 0.14364375174045563, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "ref_ce_loss": 0.22031280398368835, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "loss": 1.3026163578033447, + "step": 2270 + }, + { + "ce_loss": 0.3515109717845917, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "distill_loss": 0.14750359952449799, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "ref_ce_loss": 0.2071472853422165, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "loss": 1.009307861328125, + "step": 2270 + }, + { + "ce_loss": 0.27528730034828186, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "distill_loss": 0.15073201060295105, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "ref_ce_loss": 0.17023694515228271, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "loss": 1.061471939086914, + "step": 2270 + }, + { + "ce_loss": 0.3027641773223877, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "distill_loss": 0.14477810263633728, + "epoch": 0.7571714476317545, + "step": 2270 + }, + { + "epoch": 0.7571714476317545, + "ref_ce_loss": 0.1614489108324051, + "step": 2270 + }, + { + "epoch": 0.7605070046697798, + "loss": 0.7498, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "grad_norm": 2.607236862182617, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "learning_rate": 0.0002966838459797789, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "loss": 0.7864717245101929, + "step": 2280 + }, + { + "ce_loss": 0.27395790815353394, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "distill_loss": 0.19188761711120605, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "ref_ce_loss": 0.16352500021457672, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "loss": 0.7618268728256226, + "step": 2280 + }, + { + "ce_loss": 0.3203011155128479, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "distill_loss": 0.2293919324874878, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "ref_ce_loss": 0.1524127572774887, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "loss": 0.6663198471069336, + "step": 2280 + }, + { + "ce_loss": 0.282576322555542, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "distill_loss": 0.24836762249469757, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "ref_ce_loss": 0.1351906657218933, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "loss": 0.6264198422431946, + "step": 2280 + }, + { + "ce_loss": 0.23629958927631378, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "distill_loss": 0.18736609816551208, + "epoch": 0.7605070046697798, + "step": 2280 + }, + { + "epoch": 0.7605070046697798, + "ref_ce_loss": 0.20251253247261047, + "step": 2280 + }, + { + "epoch": 0.7638425617078052, + "loss": 0.7769, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "grad_norm": 3.2813711166381836, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "learning_rate": 0.0002966413547985062, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "loss": 1.0629838705062866, + "step": 2290 + }, + { + "ce_loss": 0.34151533246040344, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "distill_loss": 0.21097716689109802, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "ref_ce_loss": 0.17457735538482666, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "loss": 0.6704175472259521, + "step": 2290 + }, + { + "ce_loss": 0.2535441517829895, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "distill_loss": 0.1876956969499588, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "ref_ce_loss": 0.15311689674854279, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "loss": 0.6135028600692749, + "step": 2290 + }, + { + "ce_loss": 0.19847552478313446, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "distill_loss": 0.18787772953510284, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "ref_ce_loss": 0.14677968621253967, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "loss": 1.4862701892852783, + "step": 2290 + }, + { + "ce_loss": 0.26155686378479004, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "distill_loss": 0.2060234695672989, + "epoch": 0.7638425617078052, + "step": 2290 + }, + { + "epoch": 0.7638425617078052, + "ref_ce_loss": 0.14456400275230408, + "step": 2290 + }, + { + "epoch": 0.7671781187458305, + "loss": 0.7509, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "grad_norm": 2.5237789154052734, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "learning_rate": 0.0002965985962012477, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "loss": 0.5957300066947937, + "step": 2300 + }, + { + "ce_loss": 0.26703551411628723, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "distill_loss": 0.1350327730178833, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "ref_ce_loss": 0.1326143741607666, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "loss": 0.7044022083282471, + "step": 2300 + }, + { + "ce_loss": 0.17931139469146729, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "distill_loss": 0.14432227611541748, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "ref_ce_loss": 0.16795673966407776, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "loss": 0.8188800811767578, + "step": 2300 + }, + { + "ce_loss": 0.29290133714675903, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "distill_loss": 0.1588171422481537, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "ref_ce_loss": 0.1536998599767685, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "loss": 0.708327054977417, + "step": 2300 + }, + { + "ce_loss": 0.32811471819877625, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "distill_loss": 0.17735375463962555, + "epoch": 0.7671781187458305, + "step": 2300 + }, + { + "epoch": 0.7671781187458305, + "ref_ce_loss": 0.1569249927997589, + "step": 2300 + }, + { + "epoch": 0.7705136757838559, + "loss": 0.8038, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "grad_norm": 2.1413021087646484, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "learning_rate": 0.00029655557026597815, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "loss": 0.6469117999076843, + "step": 2310 + }, + { + "ce_loss": 0.22463861107826233, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "distill_loss": 0.13765175640583038, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "ref_ce_loss": 0.18833401799201965, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "loss": 0.8655230402946472, + "step": 2310 + }, + { + "ce_loss": 0.3175293803215027, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "distill_loss": 0.1456879824399948, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "ref_ce_loss": 0.1650840938091278, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "loss": 0.7164068818092346, + "step": 2310 + }, + { + "ce_loss": 0.2648800313472748, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "distill_loss": 0.17314201593399048, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "ref_ce_loss": 0.1560521125793457, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "loss": 0.4807608723640442, + "step": 2310 + }, + { + "ce_loss": 0.1812056303024292, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "distill_loss": 0.12476127594709396, + "epoch": 0.7705136757838559, + "step": 2310 + }, + { + "epoch": 0.7705136757838559, + "ref_ce_loss": 0.12248016893863678, + "step": 2310 + }, + { + "epoch": 0.7738492328218812, + "loss": 0.7234, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "grad_norm": 1.9185909032821655, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "learning_rate": 0.0002965122770711599, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "loss": 0.4447229206562042, + "step": 2320 + }, + { + "ce_loss": 0.17661221325397491, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "distill_loss": 0.15855096280574799, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "ref_ce_loss": 0.10898029804229736, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "loss": 1.2085399627685547, + "step": 2320 + }, + { + "ce_loss": 0.2651851773262024, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "distill_loss": 0.13599959015846252, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "ref_ce_loss": 0.131027951836586, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "loss": 0.9505325555801392, + "step": 2320 + }, + { + "ce_loss": 0.27964797616004944, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "distill_loss": 0.1719699501991272, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "ref_ce_loss": 0.12308228015899658, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "loss": 0.9700384140014648, + "step": 2320 + }, + { + "ce_loss": 0.5339371562004089, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "distill_loss": 0.22477610409259796, + "epoch": 0.7738492328218812, + "step": 2320 + }, + { + "epoch": 0.7738492328218812, + "ref_ce_loss": 0.15580026805400848, + "step": 2320 + }, + { + "epoch": 0.7771847898599066, + "loss": 0.7795, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "grad_norm": 2.1152358055114746, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "learning_rate": 0.00029646871669574256, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "loss": 0.6992760896682739, + "step": 2330 + }, + { + "ce_loss": 0.2329271286725998, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "distill_loss": 0.11824934929609299, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "ref_ce_loss": 0.16220200061798096, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "loss": 0.6952053904533386, + "step": 2330 + }, + { + "ce_loss": 0.3089507818222046, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "distill_loss": 0.13048169016838074, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "ref_ce_loss": 0.1320132315158844, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "loss": 0.9603186845779419, + "step": 2330 + }, + { + "ce_loss": 0.3964546322822571, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "distill_loss": 0.13962863385677338, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "ref_ce_loss": 0.20740242302417755, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "loss": 1.281550407409668, + "step": 2330 + }, + { + "ce_loss": 0.24422332644462585, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "distill_loss": 0.10252149403095245, + "epoch": 0.7771847898599066, + "step": 2330 + }, + { + "epoch": 0.7771847898599066, + "ref_ce_loss": 0.15284155309200287, + "step": 2330 + }, + { + "epoch": 0.7805203468979319, + "loss": 0.7734, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "grad_norm": 2.945228099822998, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "learning_rate": 0.00029642488921916325, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "loss": 0.5152295827865601, + "step": 2340 + }, + { + "ce_loss": 0.21698597073554993, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "distill_loss": 0.15574845671653748, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "ref_ce_loss": 0.14177797734737396, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "loss": 0.6439272165298462, + "step": 2340 + }, + { + "ce_loss": 0.23492102324962616, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "distill_loss": 0.1553235501050949, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "ref_ce_loss": 0.11704672873020172, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "loss": 0.6790904998779297, + "step": 2340 + }, + { + "ce_loss": 0.298139363527298, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "distill_loss": 0.1888904869556427, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "ref_ce_loss": 0.19122987985610962, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "loss": 0.5988003015518188, + "step": 2340 + }, + { + "ce_loss": 0.1609693169593811, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "distill_loss": 0.15114513039588928, + "epoch": 0.7805203468979319, + "step": 2340 + }, + { + "epoch": 0.7805203468979319, + "ref_ce_loss": 0.18135370314121246, + "step": 2340 + }, + { + "epoch": 0.7838559039359573, + "loss": 0.7991, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "grad_norm": 3.823965311050415, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "learning_rate": 0.0002963807947213458, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "loss": 0.8651024103164673, + "step": 2350 + }, + { + "ce_loss": 0.22501082718372345, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "distill_loss": 0.1862696409225464, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "ref_ce_loss": 0.10980051010847092, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "loss": 0.550654947757721, + "step": 2350 + }, + { + "ce_loss": 0.2535894811153412, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "distill_loss": 0.1499275267124176, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "ref_ce_loss": 0.11346378922462463, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "loss": 0.6153100728988647, + "step": 2350 + }, + { + "ce_loss": 0.2186146378517151, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "distill_loss": 0.14961737394332886, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "ref_ce_loss": 0.1576690971851349, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "loss": 0.7601372599601746, + "step": 2350 + }, + { + "ce_loss": 0.2906789481639862, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "distill_loss": 0.16383947432041168, + "epoch": 0.7838559039359573, + "step": 2350 + }, + { + "epoch": 0.7838559039359573, + "ref_ce_loss": 0.20926496386528015, + "step": 2350 + }, + { + "epoch": 0.7871914609739826, + "loss": 0.7316, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "grad_norm": 2.4781031608581543, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "learning_rate": 0.0002963364332827014, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "loss": 0.6347142457962036, + "step": 2360 + }, + { + "ce_loss": 0.20326924324035645, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "distill_loss": 0.14620020985603333, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "ref_ce_loss": 0.14227043092250824, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "loss": 1.0564327239990234, + "step": 2360 + }, + { + "ce_loss": 0.3296639025211334, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "distill_loss": 0.20934753119945526, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "ref_ce_loss": 0.1660376489162445, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "loss": 0.7564572691917419, + "step": 2360 + }, + { + "ce_loss": 0.23574452102184296, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "distill_loss": 0.16113722324371338, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "ref_ce_loss": 0.18381242454051971, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "loss": 1.2118096351623535, + "step": 2360 + }, + { + "ce_loss": 0.27346286177635193, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "distill_loss": 0.222482368350029, + "epoch": 0.7871914609739826, + "step": 2360 + }, + { + "epoch": 0.7871914609739826, + "ref_ce_loss": 0.17169533669948578, + "step": 2360 + }, + { + "epoch": 0.790527018012008, + "loss": 0.7116, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "grad_norm": 2.3116695880889893, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "learning_rate": 0.00029629180498412765, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "loss": 0.7551827430725098, + "step": 2370 + }, + { + "ce_loss": 0.21025662124156952, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "distill_loss": 0.10454167425632477, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "ref_ce_loss": 0.1653343290090561, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "loss": 0.3096018135547638, + "step": 2370 + }, + { + "ce_loss": 0.13652345538139343, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "distill_loss": 0.09901779890060425, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "ref_ce_loss": 0.07386656105518341, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "loss": 0.5769529938697815, + "step": 2370 + }, + { + "ce_loss": 0.2886972427368164, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "distill_loss": 0.11762228608131409, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "ref_ce_loss": 0.1701674461364746, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "loss": 0.8072974681854248, + "step": 2370 + }, + { + "ce_loss": 0.3220660090446472, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "distill_loss": 0.12519507110118866, + "epoch": 0.790527018012008, + "step": 2370 + }, + { + "epoch": 0.790527018012008, + "ref_ce_loss": 0.20523789525032043, + "step": 2370 + }, + { + "epoch": 0.7938625750500333, + "loss": 0.7613, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "grad_norm": 4.6878509521484375, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "learning_rate": 0.00029624690990700907, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "loss": 0.6524088382720947, + "step": 2380 + }, + { + "ce_loss": 0.21513007581233978, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "distill_loss": 0.24648374319076538, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "ref_ce_loss": 0.13517114520072937, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "loss": 1.555312156677246, + "step": 2380 + }, + { + "ce_loss": 0.3172930181026459, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "distill_loss": 0.18661177158355713, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "ref_ce_loss": 0.24270252883434296, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "loss": 0.7257373332977295, + "step": 2380 + }, + { + "ce_loss": 0.28231558203697205, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "distill_loss": 0.22690123319625854, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "ref_ce_loss": 0.15161336958408356, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "loss": 0.7957109212875366, + "step": 2380 + }, + { + "ce_loss": 0.23962001502513885, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "distill_loss": 0.22382575273513794, + "epoch": 0.7938625750500333, + "step": 2380 + }, + { + "epoch": 0.7938625750500333, + "ref_ce_loss": 0.21372731029987335, + "step": 2380 + }, + { + "epoch": 0.7971981320880587, + "loss": 0.8852, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "grad_norm": 2.4466781616210938, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "learning_rate": 0.00029620174813321646, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "loss": 0.736382782459259, + "step": 2390 + }, + { + "ce_loss": 0.2084326297044754, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "distill_loss": 0.2203093320131302, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "ref_ce_loss": 0.14905014634132385, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "loss": 0.6536170840263367, + "step": 2390 + }, + { + "ce_loss": 0.19936946034431458, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "distill_loss": 0.1701754927635193, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "ref_ce_loss": 0.14872415363788605, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "loss": 0.7685835361480713, + "step": 2390 + }, + { + "ce_loss": 0.2817256450653076, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "distill_loss": 0.1766086220741272, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "ref_ce_loss": 0.14955390989780426, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "loss": 1.0945006608963013, + "step": 2390 + }, + { + "ce_loss": 0.3128070533275604, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "distill_loss": 0.24318374693393707, + "epoch": 0.7971981320880587, + "step": 2390 + }, + { + "epoch": 0.7971981320880587, + "ref_ce_loss": 0.20368048548698425, + "step": 2390 + }, + { + "epoch": 0.800533689126084, + "loss": 0.7957, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "grad_norm": 3.198331832885742, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "learning_rate": 0.0002961563197451072, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "loss": 1.1640307903289795, + "step": 2400 + }, + { + "ce_loss": 0.2592827081680298, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "distill_loss": 0.17209471762180328, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "ref_ce_loss": 0.13521651923656464, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "loss": 0.9015942811965942, + "step": 2400 + }, + { + "ce_loss": 0.22579120099544525, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "distill_loss": 0.1696089655160904, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "ref_ce_loss": 0.0945950597524643, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "loss": 1.1169930696487427, + "step": 2400 + }, + { + "ce_loss": 0.24338200688362122, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "distill_loss": 0.17723533511161804, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "ref_ce_loss": 0.18651683628559113, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "loss": 0.8603613376617432, + "step": 2400 + }, + { + "ce_loss": 0.3572869300842285, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "distill_loss": 0.16341561079025269, + "epoch": 0.800533689126084, + "step": 2400 + }, + { + "epoch": 0.800533689126084, + "ref_ce_loss": 0.2452453076839447, + "step": 2400 + }, + { + "epoch": 0.8038692461641094, + "loss": 0.8002, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "grad_norm": 2.273329973220825, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "learning_rate": 0.00029611062482552464, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "loss": 1.0640661716461182, + "step": 2410 + }, + { + "ce_loss": 0.30567464232444763, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "distill_loss": 0.20861530303955078, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "ref_ce_loss": 0.22182579338550568, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "loss": 0.7910094261169434, + "step": 2410 + }, + { + "ce_loss": 0.2559601664543152, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "distill_loss": 0.19917668402194977, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "ref_ce_loss": 0.1583707183599472, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "loss": 0.6116624474525452, + "step": 2410 + }, + { + "ce_loss": 0.21384522318840027, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "distill_loss": 0.17438215017318726, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "ref_ce_loss": 0.12977702915668488, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "loss": 0.8569213151931763, + "step": 2410 + }, + { + "ce_loss": 0.40515777468681335, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "distill_loss": 0.2386610209941864, + "epoch": 0.8038692461641094, + "step": 2410 + }, + { + "epoch": 0.8038692461641094, + "ref_ce_loss": 0.16905608773231506, + "step": 2410 + }, + { + "epoch": 0.8072048032021347, + "loss": 0.7839, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "grad_norm": 2.3425471782684326, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "learning_rate": 0.0002960646634577983, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "loss": 0.8587204813957214, + "step": 2420 + }, + { + "ce_loss": 0.2946504056453705, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "distill_loss": 0.26324498653411865, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "ref_ce_loss": 0.17262223362922668, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "loss": 1.2969701290130615, + "step": 2420 + }, + { + "ce_loss": 0.22363796830177307, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "distill_loss": 0.19747430086135864, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "ref_ce_loss": 0.13005143404006958, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "loss": 0.711165189743042, + "step": 2420 + }, + { + "ce_loss": 0.17365582287311554, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "distill_loss": 0.20176240801811218, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "ref_ce_loss": 0.10909857600927353, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "loss": 0.7779433727264404, + "step": 2420 + }, + { + "ce_loss": 0.20582044124603271, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "distill_loss": 0.2517743706703186, + "epoch": 0.8072048032021347, + "step": 2420 + }, + { + "epoch": 0.8072048032021347, + "ref_ce_loss": 0.17426325380802155, + "step": 2420 + }, + { + "epoch": 0.8105403602401601, + "loss": 0.8096, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "grad_norm": 1.9124221801757812, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "learning_rate": 0.00029601843572574373, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "loss": 0.6110777854919434, + "step": 2430 + }, + { + "ce_loss": 0.21673980355262756, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "distill_loss": 0.19520121812820435, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "ref_ce_loss": 0.08564713597297668, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "loss": 0.7680901885032654, + "step": 2430 + }, + { + "ce_loss": 0.31637847423553467, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "distill_loss": 0.1756715029478073, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "ref_ce_loss": 0.13785050809383392, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "loss": 0.7048861980438232, + "step": 2430 + }, + { + "ce_loss": 0.28358614444732666, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "distill_loss": 0.16370432078838348, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "ref_ce_loss": 0.1661026030778885, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "loss": 0.6960597038269043, + "step": 2430 + }, + { + "ce_loss": 0.25172582268714905, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "distill_loss": 0.15557169914245605, + "epoch": 0.8105403602401601, + "step": 2430 + }, + { + "epoch": 0.8105403602401601, + "ref_ce_loss": 0.12519201636314392, + "step": 2430 + }, + { + "epoch": 0.8138759172781854, + "loss": 0.8606, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "grad_norm": 2.105224609375, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "learning_rate": 0.0002959719417136619, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "loss": 0.6570507287979126, + "step": 2440 + }, + { + "ce_loss": 0.22971493005752563, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "distill_loss": 0.20155471563339233, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "ref_ce_loss": 0.17011815309524536, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "loss": 0.8933101296424866, + "step": 2440 + }, + { + "ce_loss": 0.3815864324569702, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "distill_loss": 0.2659887373447418, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "ref_ce_loss": 0.18426789343357086, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "loss": 0.7041268944740295, + "step": 2440 + }, + { + "ce_loss": 0.2709902822971344, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "distill_loss": 0.2296167016029358, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "ref_ce_loss": 0.1552482694387436, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "loss": 0.75736004114151, + "step": 2440 + }, + { + "ce_loss": 0.2580389678478241, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "distill_loss": 0.21966888010501862, + "epoch": 0.8138759172781854, + "step": 2440 + }, + { + "epoch": 0.8138759172781854, + "ref_ce_loss": 0.18772462010383606, + "step": 2440 + }, + { + "epoch": 0.8172114743162108, + "loss": 0.76, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "grad_norm": 2.26934814453125, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "learning_rate": 0.00029592518150633963, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "loss": 0.6550593972206116, + "step": 2450 + }, + { + "ce_loss": 0.2627783417701721, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "distill_loss": 0.17369389533996582, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "ref_ce_loss": 0.16940702497959137, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "loss": 0.5833581686019897, + "step": 2450 + }, + { + "ce_loss": 0.2363595813512802, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "distill_loss": 0.14457328617572784, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "ref_ce_loss": 0.13348720967769623, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "loss": 0.8066520690917969, + "step": 2450 + }, + { + "ce_loss": 0.2194216549396515, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "distill_loss": 0.16594922542572021, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "ref_ce_loss": 0.20511534810066223, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "loss": 0.4526219964027405, + "step": 2450 + }, + { + "ce_loss": 0.1525576412677765, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "distill_loss": 0.1163424700498581, + "epoch": 0.8172114743162108, + "step": 2450 + }, + { + "epoch": 0.8172114743162108, + "ref_ce_loss": 0.12026916444301605, + "step": 2450 + }, + { + "epoch": 0.8205470313542361, + "loss": 0.7301, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "grad_norm": 2.068974494934082, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "learning_rate": 0.00029587815518904907, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "loss": 0.7143868207931519, + "step": 2460 + }, + { + "ce_loss": 0.33941391110420227, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "distill_loss": 0.2006298154592514, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "ref_ce_loss": 0.1742485612630844, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "loss": 0.5091679692268372, + "step": 2460 + }, + { + "ce_loss": 0.21453019976615906, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "distill_loss": 0.13742676377296448, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "ref_ce_loss": 0.1569163054227829, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "loss": 0.7609373331069946, + "step": 2460 + }, + { + "ce_loss": 0.26815611124038696, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "distill_loss": 0.14041054248809814, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "ref_ce_loss": 0.14617282152175903, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "loss": 0.5668540000915527, + "step": 2460 + }, + { + "ce_loss": 0.24641276895999908, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "distill_loss": 0.14012645184993744, + "epoch": 0.8205470313542361, + "step": 2460 + }, + { + "epoch": 0.8205470313542361, + "ref_ce_loss": 0.18027812242507935, + "step": 2460 + }, + { + "epoch": 0.8238825883922615, + "loss": 0.7348, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "grad_norm": 2.643747568130493, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "learning_rate": 0.00029583086284754766, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "loss": 0.6161603927612305, + "step": 2470 + }, + { + "ce_loss": 0.25048938393592834, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "distill_loss": 0.12217261642217636, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "ref_ce_loss": 0.17147208750247955, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "loss": 0.5763737559318542, + "step": 2470 + }, + { + "ce_loss": 0.21852903068065643, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "distill_loss": 0.13863025605678558, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "ref_ce_loss": 0.12992407381534576, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "loss": 1.011842131614685, + "step": 2470 + }, + { + "ce_loss": 0.22845333814620972, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "distill_loss": 0.10414294898509979, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "ref_ce_loss": 0.11669415980577469, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "loss": 0.7752334475517273, + "step": 2470 + }, + { + "ce_loss": 0.2148449867963791, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "distill_loss": 0.1058155745267868, + "epoch": 0.8238825883922615, + "step": 2470 + }, + { + "epoch": 0.8238825883922615, + "ref_ce_loss": 0.16419118642807007, + "step": 2470 + }, + { + "epoch": 0.8272181454302868, + "loss": 0.7157, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "grad_norm": 2.1814582347869873, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "learning_rate": 0.00029578330456807804, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "loss": 1.1222140789031982, + "step": 2480 + }, + { + "ce_loss": 0.21106146275997162, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "distill_loss": 0.15598347783088684, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "ref_ce_loss": 0.1223301962018013, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "loss": 1.7699038982391357, + "step": 2480 + }, + { + "ce_loss": 0.2586840093135834, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "distill_loss": 0.16339309513568878, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "ref_ce_loss": 0.18896687030792236, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "loss": 0.6121606826782227, + "step": 2480 + }, + { + "ce_loss": 0.2650245726108551, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "distill_loss": 0.1810838282108307, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "ref_ce_loss": 0.11699043959379196, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "loss": 0.751914381980896, + "step": 2480 + }, + { + "ce_loss": 0.3102262616157532, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "distill_loss": 0.15393638610839844, + "epoch": 0.8272181454302868, + "step": 2480 + }, + { + "epoch": 0.8272181454302868, + "ref_ce_loss": 0.16068010032176971, + "step": 2480 + }, + { + "epoch": 0.8305537024683122, + "loss": 0.7792, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "grad_norm": 1.7032862901687622, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "learning_rate": 0.0002957354804373677, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "loss": 0.46619075536727905, + "step": 2490 + }, + { + "ce_loss": 0.19643811881542206, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "distill_loss": 0.1185787171125412, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "ref_ce_loss": 0.150973379611969, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "loss": 0.5043961405754089, + "step": 2490 + }, + { + "ce_loss": 0.13823407888412476, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "distill_loss": 0.0923272967338562, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "ref_ce_loss": 0.09950131922960281, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "loss": 0.5879054069519043, + "step": 2490 + }, + { + "ce_loss": 0.23579871654510498, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "distill_loss": 0.14946657419204712, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "ref_ce_loss": 0.10562227666378021, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "loss": 0.7153053879737854, + "step": 2490 + }, + { + "ce_loss": 0.3567087650299072, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "distill_loss": 0.13327312469482422, + "epoch": 0.8305537024683122, + "step": 2490 + }, + { + "epoch": 0.8305537024683122, + "ref_ce_loss": 0.15352420508861542, + "step": 2490 + }, + { + "epoch": 0.8338892595063375, + "loss": 0.7088, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "grad_norm": 3.5686287879943848, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "learning_rate": 0.000295687390542629, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "loss": 0.6453408598899841, + "step": 2500 + }, + { + "ce_loss": 0.2111901193857193, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "distill_loss": 0.16144970059394836, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "ref_ce_loss": 0.16724175214767456, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "loss": 0.7878329753875732, + "step": 2500 + }, + { + "ce_loss": 0.19704513251781464, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "distill_loss": 0.24139323830604553, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "ref_ce_loss": 0.12408225983381271, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "loss": 0.9499678015708923, + "step": 2500 + }, + { + "ce_loss": 0.4058931767940521, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "distill_loss": 0.267963171005249, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "ref_ce_loss": 0.2148558646440506, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "loss": 0.8748348951339722, + "step": 2500 + }, + { + "ce_loss": 0.30255037546157837, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "distill_loss": 0.25724613666534424, + "epoch": 0.8338892595063375, + "step": 2500 + }, + { + "epoch": 0.8338892595063375, + "ref_ce_loss": 0.15020081400871277, + "step": 2500 + }, + { + "epoch": 0.8372248165443629, + "loss": 0.8287, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "grad_norm": 2.9365673065185547, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "learning_rate": 0.0002956390349715589, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "loss": 0.7504816651344299, + "step": 2510 + }, + { + "ce_loss": 0.32103756070137024, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "distill_loss": 0.15747405588626862, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "ref_ce_loss": 0.19202551245689392, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "loss": 0.6597751379013062, + "step": 2510 + }, + { + "ce_loss": 0.20975995063781738, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "distill_loss": 0.19682608544826508, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "ref_ce_loss": 0.13535785675048828, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "loss": 0.6817447543144226, + "step": 2510 + }, + { + "ce_loss": 0.238074392080307, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "distill_loss": 0.15745586156845093, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "ref_ce_loss": 0.16773714125156403, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "loss": 0.4860121011734009, + "step": 2510 + }, + { + "ce_loss": 0.17487522959709167, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "distill_loss": 0.17117461562156677, + "epoch": 0.8372248165443629, + "step": 2510 + }, + { + "epoch": 0.8372248165443629, + "ref_ce_loss": 0.09342694282531738, + "step": 2510 + }, + { + "epoch": 0.8405603735823882, + "loss": 0.7391, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "grad_norm": 2.870335578918457, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "learning_rate": 0.0002955904138123389, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "loss": 0.6082489490509033, + "step": 2520 + }, + { + "ce_loss": 0.2526482045650482, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "distill_loss": 0.09827655553817749, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "ref_ce_loss": 0.1371910274028778, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "loss": 0.8684769868850708, + "step": 2520 + }, + { + "ce_loss": 0.2551835775375366, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "distill_loss": 0.15360848605632782, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "ref_ce_loss": 0.19132332503795624, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "loss": 0.6976226568222046, + "step": 2520 + }, + { + "ce_loss": 0.31281253695487976, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "distill_loss": 0.15646786987781525, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "ref_ce_loss": 0.16439111530780792, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "loss": 0.5785008668899536, + "step": 2520 + }, + { + "ce_loss": 0.27898046374320984, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "distill_loss": 0.12810131907463074, + "epoch": 0.8405603735823882, + "step": 2520 + }, + { + "epoch": 0.8405603735823882, + "ref_ce_loss": 0.11890950053930283, + "step": 2520 + }, + { + "epoch": 0.8438959306204136, + "loss": 0.8542, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "grad_norm": 2.7163124084472656, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "learning_rate": 0.0002955415271536349, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "loss": 0.9421725869178772, + "step": 2530 + }, + { + "ce_loss": 0.3706272840499878, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "distill_loss": 0.23287363350391388, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "ref_ce_loss": 0.20354139804840088, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "loss": 0.524922251701355, + "step": 2530 + }, + { + "ce_loss": 0.15988919138908386, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "distill_loss": 0.1579529196023941, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "ref_ce_loss": 0.09271088242530823, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "loss": 0.6980333924293518, + "step": 2530 + }, + { + "ce_loss": 0.266355961561203, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "distill_loss": 0.1916055679321289, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "ref_ce_loss": 0.18757179379463196, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "loss": 0.8313827514648438, + "step": 2530 + }, + { + "ce_loss": 0.213656947016716, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "distill_loss": 0.2044852375984192, + "epoch": 0.8438959306204136, + "step": 2530 + }, + { + "epoch": 0.8438959306204136, + "ref_ce_loss": 0.21953712403774261, + "step": 2530 + }, + { + "epoch": 0.8472314876584389, + "loss": 0.739, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "grad_norm": 1.7960280179977417, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "learning_rate": 0.0002954923750845968, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "loss": 0.6077286601066589, + "step": 2540 + }, + { + "ce_loss": 0.2709120810031891, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "distill_loss": 0.11971865594387054, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "ref_ce_loss": 0.1525728404521942, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "loss": 0.5749415755271912, + "step": 2540 + }, + { + "ce_loss": 0.23235177993774414, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "distill_loss": 0.11225643754005432, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "ref_ce_loss": 0.10792374610900879, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "loss": 0.762617826461792, + "step": 2540 + }, + { + "ce_loss": 0.3228139877319336, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "distill_loss": 0.13279499113559723, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "ref_ce_loss": 0.14764104783535004, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "loss": 0.9446433782577515, + "step": 2540 + }, + { + "ce_loss": 0.4045880436897278, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "distill_loss": 0.1612992286682129, + "epoch": 0.8472314876584389, + "step": 2540 + }, + { + "epoch": 0.8472314876584389, + "ref_ce_loss": 0.20145398378372192, + "step": 2540 + }, + { + "epoch": 0.8505670446964643, + "loss": 0.7286, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "grad_norm": 2.043759346008301, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "learning_rate": 0.0002954429576948586, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "loss": 0.6958951354026794, + "step": 2550 + }, + { + "ce_loss": 0.2730863094329834, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "distill_loss": 0.16268062591552734, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "ref_ce_loss": 0.16490232944488525, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "loss": 0.48699092864990234, + "step": 2550 + }, + { + "ce_loss": 0.19955317676067352, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "distill_loss": 0.1272369772195816, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "ref_ce_loss": 0.12268470227718353, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "loss": 0.6086199283599854, + "step": 2550 + }, + { + "ce_loss": 0.19976550340652466, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "distill_loss": 0.14242056012153625, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "ref_ce_loss": 0.1820511668920517, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "loss": 1.2203011512756348, + "step": 2550 + }, + { + "ce_loss": 0.29445260763168335, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "distill_loss": 0.14953580498695374, + "epoch": 0.8505670446964643, + "step": 2550 + }, + { + "epoch": 0.8505670446964643, + "ref_ce_loss": 0.17585910856723785, + "step": 2550 + }, + { + "epoch": 0.8539026017344896, + "loss": 0.7847, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "grad_norm": 4.350864887237549, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "learning_rate": 0.0002953932750745382, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "loss": 1.0202877521514893, + "step": 2560 + }, + { + "ce_loss": 0.3814989924430847, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "distill_loss": 0.15244382619857788, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "ref_ce_loss": 0.2545083463191986, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "loss": 0.6912614107131958, + "step": 2560 + }, + { + "ce_loss": 0.17741495370864868, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "distill_loss": 0.12845268845558167, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "ref_ce_loss": 0.10197576880455017, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "loss": 0.783969521522522, + "step": 2560 + }, + { + "ce_loss": 0.2941688597202301, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "distill_loss": 0.14433158934116364, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "ref_ce_loss": 0.12809965014457703, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "loss": 0.8776670694351196, + "step": 2560 + }, + { + "ce_loss": 0.3103247582912445, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "distill_loss": 0.1200985312461853, + "epoch": 0.8539026017344896, + "step": 2560 + }, + { + "epoch": 0.8539026017344896, + "ref_ce_loss": 0.16940616071224213, + "step": 2560 + }, + { + "epoch": 0.857238158772515, + "loss": 0.7478, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "grad_norm": 2.9963643550872803, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "learning_rate": 0.0002953433273142369, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "loss": 0.6163371801376343, + "step": 2570 + }, + { + "ce_loss": 0.18766649067401886, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "distill_loss": 0.13034361600875854, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "ref_ce_loss": 0.15359312295913696, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "loss": 0.6791805624961853, + "step": 2570 + }, + { + "ce_loss": 0.30410608649253845, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "distill_loss": 0.11348666250705719, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "ref_ce_loss": 0.16625384986400604, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "loss": 0.94990074634552, + "step": 2570 + }, + { + "ce_loss": 0.40707695484161377, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "distill_loss": 0.17179466784000397, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "ref_ce_loss": 0.18617656826972961, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "loss": 0.56731116771698, + "step": 2570 + }, + { + "ce_loss": 0.25088340044021606, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "distill_loss": 0.10557133704423904, + "epoch": 0.857238158772515, + "step": 2570 + }, + { + "epoch": 0.857238158772515, + "ref_ce_loss": 0.15625420212745667, + "step": 2570 + }, + { + "epoch": 0.8605737158105403, + "loss": 0.8025, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "grad_norm": 2.4704360961914062, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "learning_rate": 0.0002952931145050399, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "loss": 0.7413711547851562, + "step": 2580 + }, + { + "ce_loss": 0.21104562282562256, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "distill_loss": 0.19385764002799988, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "ref_ce_loss": 0.15575964748859406, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "loss": 0.779699981212616, + "step": 2580 + }, + { + "ce_loss": 0.26666852831840515, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "distill_loss": 0.1735392063856125, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "ref_ce_loss": 0.26088887453079224, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "loss": 0.9133224487304688, + "step": 2580 + }, + { + "ce_loss": 0.2493307888507843, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "distill_loss": 0.16577640175819397, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "ref_ce_loss": 0.1426527500152588, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "loss": 1.1199960708618164, + "step": 2580 + }, + { + "ce_loss": 0.2786944508552551, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "distill_loss": 0.191022589802742, + "epoch": 0.8605737158105403, + "step": 2580 + }, + { + "epoch": 0.8605737158105403, + "ref_ce_loss": 0.11835911870002747, + "step": 2580 + }, + { + "epoch": 0.8639092728485657, + "loss": 0.7495, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "grad_norm": 1.9572570323944092, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "learning_rate": 0.00029524263673851557, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "loss": 0.6552140712738037, + "step": 2590 + }, + { + "ce_loss": 0.23570433259010315, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "distill_loss": 0.13375253975391388, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "ref_ce_loss": 0.18127413094043732, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "loss": 0.8274029493331909, + "step": 2590 + }, + { + "ce_loss": 0.3903733193874359, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "distill_loss": 0.15356363356113434, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "ref_ce_loss": 0.2296900451183319, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "loss": 0.6463877558708191, + "step": 2590 + }, + { + "ce_loss": 0.2714865803718567, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "distill_loss": 0.1532272845506668, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "ref_ce_loss": 0.17067858576774597, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "loss": 0.6551008224487305, + "step": 2590 + }, + { + "ce_loss": 0.24287007749080658, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "distill_loss": 0.12412548065185547, + "epoch": 0.8639092728485657, + "step": 2590 + }, + { + "epoch": 0.8639092728485657, + "ref_ce_loss": 0.15614332258701324, + "step": 2590 + }, + { + "epoch": 0.867244829886591, + "loss": 0.7183, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "grad_norm": 1.8994405269622803, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "learning_rate": 0.0002951918941067153, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "loss": 0.7773920297622681, + "step": 2600 + }, + { + "ce_loss": 0.3620452880859375, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "distill_loss": 0.1090983971953392, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "ref_ce_loss": 0.2240307331085205, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "loss": 0.7050237059593201, + "step": 2600 + }, + { + "ce_loss": 0.2841552197933197, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "distill_loss": 0.10316047817468643, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "ref_ce_loss": 0.1758444905281067, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "loss": 1.683584451675415, + "step": 2600 + }, + { + "ce_loss": 0.2757621109485626, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "distill_loss": 0.10363587737083435, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "ref_ce_loss": 0.1815105527639389, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "loss": 0.6704810857772827, + "step": 2600 + }, + { + "ce_loss": 0.190278559923172, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "distill_loss": 0.1048944815993309, + "epoch": 0.867244829886591, + "step": 2600 + }, + { + "epoch": 0.867244829886591, + "ref_ce_loss": 0.1369696855545044, + "step": 2600 + }, + { + "epoch": 0.8705803869246164, + "loss": 0.7809, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "grad_norm": 4.59869909286499, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "learning_rate": 0.0002951408867021737, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "loss": 0.6961145401000977, + "step": 2610 + }, + { + "ce_loss": 0.20867450535297394, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "distill_loss": 0.222753643989563, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "ref_ce_loss": 0.16406960785388947, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "loss": 0.7274801731109619, + "step": 2610 + }, + { + "ce_loss": 0.2560276389122009, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "distill_loss": 0.27618464827537537, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "ref_ce_loss": 0.1208532452583313, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "loss": 0.530335009098053, + "step": 2610 + }, + { + "ce_loss": 0.18852654099464417, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "distill_loss": 0.20412476360797882, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "ref_ce_loss": 0.13730789721012115, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "loss": 0.5325140953063965, + "step": 2610 + }, + { + "ce_loss": 0.15463118255138397, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "distill_loss": 0.20978079736232758, + "epoch": 0.8705803869246164, + "step": 2610 + }, + { + "epoch": 0.8705803869246164, + "ref_ce_loss": 0.11247812956571579, + "step": 2610 + }, + { + "epoch": 0.8739159439626417, + "loss": 0.7297, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "grad_norm": 2.0906076431274414, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "learning_rate": 0.0002950896146179082, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "loss": 0.6182748079299927, + "step": 2620 + }, + { + "ce_loss": 0.30032458901405334, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "distill_loss": 0.17088757455348969, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "ref_ce_loss": 0.14578019082546234, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "loss": 0.6845272183418274, + "step": 2620 + }, + { + "ce_loss": 0.2438613623380661, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "distill_loss": 0.1726510226726532, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "ref_ce_loss": 0.16975204646587372, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "loss": 0.9628136157989502, + "step": 2620 + }, + { + "ce_loss": 0.22766920924186707, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "distill_loss": 0.16870301961898804, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "ref_ce_loss": 0.14269216358661652, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "loss": 0.4854929745197296, + "step": 2620 + }, + { + "ce_loss": 0.16093426942825317, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "distill_loss": 0.11879430711269379, + "epoch": 0.8739159439626417, + "step": 2620 + }, + { + "epoch": 0.8739159439626417, + "ref_ce_loss": 0.14466573297977448, + "step": 2620 + }, + { + "epoch": 0.8772515010006671, + "loss": 0.7219, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "grad_norm": 2.8054723739624023, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "learning_rate": 0.0002950380779474188, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "loss": 1.1032121181488037, + "step": 2630 + }, + { + "ce_loss": 0.27350157499313354, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "distill_loss": 0.14578324556350708, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "ref_ce_loss": 0.16324932873249054, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "loss": 0.5892089605331421, + "step": 2630 + }, + { + "ce_loss": 0.23559221625328064, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "distill_loss": 0.15483921766281128, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "ref_ce_loss": 0.11848069727420807, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "loss": 0.6435602307319641, + "step": 2630 + }, + { + "ce_loss": 0.2746572196483612, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "distill_loss": 0.13225823640823364, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "ref_ce_loss": 0.13194727897644043, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "loss": 0.7193164229393005, + "step": 2630 + }, + { + "ce_loss": 0.20938901603221893, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "distill_loss": 0.15860141813755035, + "epoch": 0.8772515010006671, + "step": 2630 + }, + { + "epoch": 0.8772515010006671, + "ref_ce_loss": 0.17175227403640747, + "step": 2630 + }, + { + "epoch": 0.8805870580386924, + "loss": 0.762, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "grad_norm": 2.040764570236206, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "learning_rate": 0.00029498627678468806, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "loss": 1.0706512928009033, + "step": 2640 + }, + { + "ce_loss": 0.2734338045120239, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "distill_loss": 0.17937436699867249, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "ref_ce_loss": 0.2127029448747635, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "loss": 0.7411646842956543, + "step": 2640 + }, + { + "ce_loss": 0.1989816427230835, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "distill_loss": 0.18253065645694733, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "ref_ce_loss": 0.1566845029592514, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "loss": 0.7127244472503662, + "step": 2640 + }, + { + "ce_loss": 0.18785162270069122, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "distill_loss": 0.13022294640541077, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "ref_ce_loss": 0.14390313625335693, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "loss": 0.3556101322174072, + "step": 2640 + }, + { + "ce_loss": 0.15927663445472717, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "distill_loss": 0.12913666665554047, + "epoch": 0.8805870580386924, + "step": 2640 + }, + { + "epoch": 0.8805870580386924, + "ref_ce_loss": 0.06653880327939987, + "step": 2640 + }, + { + "epoch": 0.8839226150767178, + "loss": 0.7489, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "grad_norm": 2.5249640941619873, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "learning_rate": 0.0002949342112241809, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "loss": 0.5982322692871094, + "step": 2650 + }, + { + "ce_loss": 0.20801950991153717, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "distill_loss": 0.09348731487989426, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "ref_ce_loss": 0.13889434933662415, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "loss": 0.539240837097168, + "step": 2650 + }, + { + "ce_loss": 0.2223464399576187, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "distill_loss": 0.12154768407344818, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "ref_ce_loss": 0.19085489213466644, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "loss": 0.524050772190094, + "step": 2650 + }, + { + "ce_loss": 0.24115309119224548, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "distill_loss": 0.10297761112451553, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "ref_ce_loss": 0.13751719892024994, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "loss": 0.3736794590950012, + "step": 2650 + }, + { + "ce_loss": 0.10971083492040634, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "distill_loss": 0.10210859775543213, + "epoch": 0.8839226150767178, + "step": 2650 + }, + { + "epoch": 0.8839226150767178, + "ref_ce_loss": 0.09514066576957703, + "step": 2650 + }, + { + "epoch": 0.8872581721147431, + "loss": 0.6912, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "grad_norm": 2.7994887828826904, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "learning_rate": 0.00029488188136084437, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "loss": 1.0558778047561646, + "step": 2660 + }, + { + "ce_loss": 0.32495981454849243, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "distill_loss": 0.14164505898952484, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "ref_ce_loss": 0.22562389075756073, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "loss": 0.602891206741333, + "step": 2660 + }, + { + "ce_loss": 0.21373873949050903, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "distill_loss": 0.12601274251937866, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "ref_ce_loss": 0.19673238694667816, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "loss": 0.9606189727783203, + "step": 2660 + }, + { + "ce_loss": 0.35660284757614136, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "distill_loss": 0.1498052179813385, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "ref_ce_loss": 0.19679509103298187, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "loss": 0.6719042658805847, + "step": 2660 + }, + { + "ce_loss": 0.24385878443717957, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "distill_loss": 0.1391836255788803, + "epoch": 0.8872581721147431, + "step": 2660 + }, + { + "epoch": 0.8872581721147431, + "ref_ce_loss": 0.1761963814496994, + "step": 2660 + }, + { + "epoch": 0.8905937291527685, + "loss": 0.7776, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "grad_norm": 4.569013595581055, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "learning_rate": 0.0002948292872901074, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "loss": 0.5371503233909607, + "step": 2670 + }, + { + "ce_loss": 0.2493179589509964, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "distill_loss": 0.11686110496520996, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "ref_ce_loss": 0.14364975690841675, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "loss": 0.7579875588417053, + "step": 2670 + }, + { + "ce_loss": 0.29506489634513855, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "distill_loss": 0.13039056956768036, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "ref_ce_loss": 0.20609255135059357, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "loss": 0.6515138745307922, + "step": 2670 + }, + { + "ce_loss": 0.2613978981971741, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "distill_loss": 0.16587205231189728, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "ref_ce_loss": 0.1350294053554535, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "loss": 0.7277687788009644, + "step": 2670 + }, + { + "ce_loss": 0.28890296816825867, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "distill_loss": 0.15814818441867828, + "epoch": 0.8905937291527685, + "step": 2670 + }, + { + "epoch": 0.8905937291527685, + "ref_ce_loss": 0.19958439469337463, + "step": 2670 + }, + { + "epoch": 0.8939292861907938, + "loss": 0.698, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "grad_norm": 1.6606212854385376, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "learning_rate": 0.000294776429107881, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "loss": 0.7957069873809814, + "step": 2680 + }, + { + "ce_loss": 0.3352309465408325, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "distill_loss": 0.11121989041566849, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "ref_ce_loss": 0.20472891628742218, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "loss": 0.7181699275970459, + "step": 2680 + }, + { + "ce_loss": 0.2733495235443115, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "distill_loss": 0.11020343005657196, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "ref_ce_loss": 0.135089710354805, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "loss": 0.8474379777908325, + "step": 2680 + }, + { + "ce_loss": 0.3044080138206482, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "distill_loss": 0.1047251969575882, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "ref_ce_loss": 0.12011805176734924, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "loss": 0.8199704885482788, + "step": 2680 + }, + { + "ce_loss": 0.35054662823677063, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "distill_loss": 0.13516145944595337, + "epoch": 0.8939292861907938, + "step": 2680 + }, + { + "epoch": 0.8939292861907938, + "ref_ce_loss": 0.18137814104557037, + "step": 2680 + }, + { + "epoch": 0.8972648432288192, + "loss": 0.678, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "grad_norm": 2.979337692260742, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "learning_rate": 0.0002947233069105575, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "loss": 1.02372145652771, + "step": 2690 + }, + { + "ce_loss": 0.2200205773115158, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "distill_loss": 0.10067467391490936, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "ref_ce_loss": 0.17001208662986755, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "loss": 0.5303847789764404, + "step": 2690 + }, + { + "ce_loss": 0.2386377602815628, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "distill_loss": 0.11323900520801544, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "ref_ce_loss": 0.17847669124603271, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "loss": 0.7682243585586548, + "step": 2690 + }, + { + "ce_loss": 0.3582816421985626, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "distill_loss": 0.1306098848581314, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "ref_ce_loss": 0.21609355509281158, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "loss": 0.5860241651535034, + "step": 2690 + }, + { + "ce_loss": 0.2113438844680786, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "distill_loss": 0.10160598903894424, + "epoch": 0.8972648432288192, + "step": 2690 + }, + { + "epoch": 0.8972648432288192, + "ref_ce_loss": 0.17873786389827728, + "step": 2690 + }, + { + "epoch": 0.9006004002668445, + "loss": 0.8651, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "grad_norm": 2.6056735515594482, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "learning_rate": 0.0002946699207950109, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "loss": 0.6318261623382568, + "step": 2700 + }, + { + "ce_loss": 0.2618946135044098, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "distill_loss": 0.11020223796367645, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "ref_ce_loss": 0.1937880516052246, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "loss": 1.0718632936477661, + "step": 2700 + }, + { + "ce_loss": 0.27328452467918396, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "distill_loss": 0.13602453470230103, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "ref_ce_loss": 0.24140718579292297, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "loss": 0.6519533395767212, + "step": 2700 + }, + { + "ce_loss": 0.2659465968608856, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "distill_loss": 0.11393342167139053, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "ref_ce_loss": 0.14486336708068848, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "loss": 0.459310919046402, + "step": 2700 + }, + { + "ce_loss": 0.16971252858638763, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "distill_loss": 0.10560064762830734, + "epoch": 0.9006004002668445, + "step": 2700 + }, + { + "epoch": 0.9006004002668445, + "ref_ce_loss": 0.11843426525592804, + "step": 2700 + }, + { + "epoch": 0.9039359573048699, + "loss": 0.8334, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "grad_norm": 2.7582480907440186, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "learning_rate": 0.0002946162708585964, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "loss": 0.5983908176422119, + "step": 2710 + }, + { + "ce_loss": 0.22176890075206757, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "distill_loss": 0.14231596887111664, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "ref_ce_loss": 0.1662299633026123, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "loss": 0.7019459009170532, + "step": 2710 + }, + { + "ce_loss": 0.21000699698925018, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "distill_loss": 0.13449858129024506, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "ref_ce_loss": 0.15763014554977417, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "loss": 0.46816378831863403, + "step": 2710 + }, + { + "ce_loss": 0.1613132506608963, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "distill_loss": 0.14022500813007355, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "ref_ce_loss": 0.12063966691493988, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "loss": 0.8382180333137512, + "step": 2710 + }, + { + "ce_loss": 0.20074015855789185, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "distill_loss": 0.12996117770671844, + "epoch": 0.9039359573048699, + "step": 2710 + }, + { + "epoch": 0.9039359573048699, + "ref_ce_loss": 0.12067507207393646, + "step": 2710 + }, + { + "epoch": 0.9072715143428952, + "loss": 0.7523, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "grad_norm": 2.3118250370025635, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "learning_rate": 0.0002945623571991503, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "loss": 0.6837984919548035, + "step": 2720 + }, + { + "ce_loss": 0.32145455479621887, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "distill_loss": 0.1139432042837143, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "ref_ce_loss": 0.20749621093273163, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "loss": 0.5676144361495972, + "step": 2720 + }, + { + "ce_loss": 0.24475722014904022, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "distill_loss": 0.1074301153421402, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "ref_ce_loss": 0.14644326269626617, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "loss": 0.7602922916412354, + "step": 2720 + }, + { + "ce_loss": 0.26372280716896057, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "distill_loss": 0.11422452330589294, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "ref_ce_loss": 0.1716698259115219, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "loss": 0.8460100293159485, + "step": 2720 + }, + { + "ce_loss": 0.19480058550834656, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "distill_loss": 0.09263000637292862, + "epoch": 0.9072715143428952, + "step": 2720 + }, + { + "epoch": 0.9072715143428952, + "ref_ce_loss": 0.16573220491409302, + "step": 2720 + }, + { + "epoch": 0.9106070713809206, + "loss": 0.7049, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "grad_norm": 3.8387629985809326, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "learning_rate": 0.0002945081799149899, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "loss": 0.7528659105300903, + "step": 2730 + }, + { + "ce_loss": 0.24429547786712646, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "distill_loss": 0.13893257081508636, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "ref_ce_loss": 0.16770748794078827, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "loss": 0.8350508213043213, + "step": 2730 + }, + { + "ce_loss": 0.17379887402057648, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "distill_loss": 0.11916208267211914, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "ref_ce_loss": 0.12559403479099274, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "loss": 0.8810762763023376, + "step": 2730 + }, + { + "ce_loss": 0.3514857292175293, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "distill_loss": 0.1322062462568283, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "ref_ce_loss": 0.2726250886917114, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "loss": 0.6011776924133301, + "step": 2730 + }, + { + "ce_loss": 0.2854574918746948, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "distill_loss": 0.13027696311473846, + "epoch": 0.9106070713809206, + "step": 2730 + }, + { + "epoch": 0.9106070713809206, + "ref_ce_loss": 0.18520604074001312, + "step": 2730 + }, + { + "epoch": 0.9139426284189459, + "loss": 0.7369, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "grad_norm": 3.095705509185791, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "learning_rate": 0.0002944537391049131, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "loss": 0.996353268623352, + "step": 2740 + }, + { + "ce_loss": 0.30720749497413635, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "distill_loss": 0.27163252234458923, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "ref_ce_loss": 0.169632688164711, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "loss": 0.804191529750824, + "step": 2740 + }, + { + "ce_loss": 0.30768176913261414, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "distill_loss": 0.28657642006874084, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "ref_ce_loss": 0.20976907014846802, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "loss": 0.6727897524833679, + "step": 2740 + }, + { + "ce_loss": 0.24741102755069733, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "distill_loss": 0.2660277187824249, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "ref_ce_loss": 0.12837496399879456, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "loss": 0.810462474822998, + "step": 2740 + }, + { + "ce_loss": 0.18826617300510406, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "distill_loss": 0.2099066823720932, + "epoch": 0.9139426284189459, + "step": 2740 + }, + { + "epoch": 0.9139426284189459, + "ref_ce_loss": 0.10664959996938705, + "step": 2740 + }, + { + "epoch": 0.9172781854569713, + "loss": 0.8526, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "grad_norm": 2.9879446029663086, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "learning_rate": 0.00029439903486819854, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "loss": 0.7249727845191956, + "step": 2750 + }, + { + "ce_loss": 0.2981150448322296, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "distill_loss": 0.13929876685142517, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "ref_ce_loss": 0.17144180834293365, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "loss": 1.1745476722717285, + "step": 2750 + }, + { + "ce_loss": 0.2252090722322464, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "distill_loss": 0.12722156941890717, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "ref_ce_loss": 0.15686529874801636, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "loss": 0.8217834234237671, + "step": 2750 + }, + { + "ce_loss": 0.28482645750045776, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "distill_loss": 0.16513006389141083, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "ref_ce_loss": 0.1964171677827835, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "loss": 0.5900613069534302, + "step": 2750 + }, + { + "ce_loss": 0.2149066925048828, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "distill_loss": 0.13745594024658203, + "epoch": 0.9172781854569713, + "step": 2750 + }, + { + "epoch": 0.9172781854569713, + "ref_ce_loss": 0.1548822671175003, + "step": 2750 + }, + { + "epoch": 0.9206137424949966, + "loss": 0.7581, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "grad_norm": 2.9715447425842285, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "learning_rate": 0.0002943440673046052, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "loss": 1.3678593635559082, + "step": 2760 + }, + { + "ce_loss": 0.3911042809486389, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "distill_loss": 0.3210551142692566, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "ref_ce_loss": 0.1883934587240219, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "loss": 0.7819101810455322, + "step": 2760 + }, + { + "ce_loss": 0.22631727159023285, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "distill_loss": 0.27888303995132446, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "ref_ce_loss": 0.15311084687709808, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "loss": 1.427027702331543, + "step": 2760 + }, + { + "ce_loss": 0.2932255268096924, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "distill_loss": 0.3127667009830475, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "ref_ce_loss": 0.1879715919494629, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "loss": 0.8832669258117676, + "step": 2760 + }, + { + "ce_loss": 0.3134843111038208, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "distill_loss": 0.27922841906547546, + "epoch": 0.9206137424949966, + "step": 2760 + }, + { + "epoch": 0.9206137424949966, + "ref_ce_loss": 0.15278160572052002, + "step": 2760 + }, + { + "epoch": 0.923949299533022, + "loss": 0.8373, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "grad_norm": 2.631944417953491, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "learning_rate": 0.0002942888365143721, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "loss": 0.9981451630592346, + "step": 2770 + }, + { + "ce_loss": 0.22759626805782318, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "distill_loss": 0.19656261801719666, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "ref_ce_loss": 0.19053815305233002, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "loss": 0.6683474779129028, + "step": 2770 + }, + { + "ce_loss": 0.2622044086456299, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "distill_loss": 0.2272324562072754, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "ref_ce_loss": 0.17883671820163727, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "loss": 0.6071282625198364, + "step": 2770 + }, + { + "ce_loss": 0.20569933950901031, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "distill_loss": 0.20898611843585968, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "ref_ce_loss": 0.09778912365436554, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "loss": 0.5389513969421387, + "step": 2770 + }, + { + "ce_loss": 0.17628800868988037, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "distill_loss": 0.18154376745224, + "epoch": 0.923949299533022, + "step": 2770 + }, + { + "epoch": 0.923949299533022, + "ref_ce_loss": 0.14046207070350647, + "step": 2770 + }, + { + "epoch": 0.9272848565710473, + "loss": 0.808, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "grad_norm": 2.491095781326294, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "learning_rate": 0.00029423334259821854, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "loss": 0.7875550389289856, + "step": 2780 + }, + { + "ce_loss": 0.29149720072746277, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "distill_loss": 0.26492205262184143, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "ref_ce_loss": 0.15465959906578064, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "loss": 0.7306146621704102, + "step": 2780 + }, + { + "ce_loss": 0.29060646891593933, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "distill_loss": 0.22705847024917603, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "ref_ce_loss": 0.14852774143218994, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "loss": 0.7648739814758301, + "step": 2780 + }, + { + "ce_loss": 0.3341917097568512, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "distill_loss": 0.227829247713089, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "ref_ce_loss": 0.14977265894412994, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "loss": 0.984939694404602, + "step": 2780 + }, + { + "ce_loss": 0.37465381622314453, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "distill_loss": 0.2559279799461365, + "epoch": 0.9272848565710473, + "step": 2780 + }, + { + "epoch": 0.9272848565710473, + "ref_ce_loss": 0.2502400577068329, + "step": 2780 + }, + { + "epoch": 0.9306204136090727, + "loss": 0.8007, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "grad_norm": 2.9029388427734375, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "learning_rate": 0.0002941775856573435, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "loss": 0.7560724020004272, + "step": 2790 + }, + { + "ce_loss": 0.24939562380313873, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "distill_loss": 0.24355027079582214, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "ref_ce_loss": 0.17152981460094452, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "loss": 0.8174520134925842, + "step": 2790 + }, + { + "ce_loss": 0.2900400757789612, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "distill_loss": 0.2231612205505371, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "ref_ce_loss": 0.2126293033361435, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "loss": 0.9072144031524658, + "step": 2790 + }, + { + "ce_loss": 0.2195039987564087, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "distill_loss": 0.22568339109420776, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "ref_ce_loss": 0.10249602794647217, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "loss": 0.7943201065063477, + "step": 2790 + }, + { + "ce_loss": 0.23619568347930908, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "distill_loss": 0.19247688353061676, + "epoch": 0.9306204136090727, + "step": 2790 + }, + { + "epoch": 0.9306204136090727, + "ref_ce_loss": 0.14785772562026978, + "step": 2790 + }, + { + "epoch": 0.933955970647098, + "loss": 0.7842, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "grad_norm": 2.3317878246307373, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "learning_rate": 0.0002941215657934256, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "loss": 0.7021505236625671, + "step": 2800 + }, + { + "ce_loss": 0.34092476963996887, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "distill_loss": 0.15701162815093994, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "ref_ce_loss": 0.20417509973049164, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "loss": 0.6944257020950317, + "step": 2800 + }, + { + "ce_loss": 0.2710808515548706, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "distill_loss": 0.21411262452602386, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "ref_ce_loss": 0.17248453199863434, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "loss": 0.3621445596218109, + "step": 2800 + }, + { + "ce_loss": 0.14474163949489594, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "distill_loss": 0.14359624683856964, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "ref_ce_loss": 0.07347650825977325, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "loss": 0.8690259456634521, + "step": 2800 + }, + { + "ce_loss": 0.3273351192474365, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "distill_loss": 0.19411294162273407, + "epoch": 0.933955970647098, + "step": 2800 + }, + { + "epoch": 0.933955970647098, + "ref_ce_loss": 0.16561636328697205, + "step": 2800 + }, + { + "epoch": 0.9372915276851234, + "loss": 0.7396, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "grad_norm": 1.872754693031311, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "learning_rate": 0.00029406528310862306, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "loss": 0.6328253746032715, + "step": 2810 + }, + { + "ce_loss": 0.26702645421028137, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "distill_loss": 0.11107096076011658, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "ref_ce_loss": 0.12137383967638016, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "loss": 0.7653927803039551, + "step": 2810 + }, + { + "ce_loss": 0.32979145646095276, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "distill_loss": 0.14815056324005127, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "ref_ce_loss": 0.14542245864868164, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "loss": 0.793331503868103, + "step": 2810 + }, + { + "ce_loss": 0.28229960799217224, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "distill_loss": 0.14099378883838654, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "ref_ce_loss": 0.1897796094417572, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "loss": 0.4289068877696991, + "step": 2810 + }, + { + "ce_loss": 0.12203547358512878, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "distill_loss": 0.1050165444612503, + "epoch": 0.9372915276851234, + "step": 2810 + }, + { + "epoch": 0.9372915276851234, + "ref_ce_loss": 0.10393452644348145, + "step": 2810 + }, + { + "epoch": 0.9406270847231488, + "loss": 0.8713, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "grad_norm": 5.31164026260376, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "learning_rate": 0.00029400873770557323, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "loss": 0.552436888217926, + "step": 2820 + }, + { + "ce_loss": 0.17268088459968567, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "distill_loss": 0.26252999901771545, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "ref_ce_loss": 0.11688818782567978, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "loss": 0.6593774557113647, + "step": 2820 + }, + { + "ce_loss": 0.16558508574962616, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "distill_loss": 0.21160022914409637, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "ref_ce_loss": 0.1414853036403656, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "loss": 0.8572372198104858, + "step": 2820 + }, + { + "ce_loss": 0.2560436427593231, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "distill_loss": 0.2739320695400238, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "ref_ce_loss": 0.1760285347700119, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "loss": 0.812274694442749, + "step": 2820 + }, + { + "ce_loss": 0.21347567439079285, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "distill_loss": 0.31114456057548523, + "epoch": 0.9406270847231488, + "step": 2820 + }, + { + "epoch": 0.9406270847231488, + "ref_ce_loss": 0.15867622196674347, + "step": 2820 + }, + { + "epoch": 0.9439626417611742, + "loss": 1.0656, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "grad_norm": 3.638387441635132, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "learning_rate": 0.00029395192968739264, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "loss": 1.7968251705169678, + "step": 2830 + }, + { + "ce_loss": 0.4090914726257324, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "distill_loss": 0.6144574880599976, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "ref_ce_loss": 0.1680503636598587, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "loss": 0.9041198492050171, + "step": 2830 + }, + { + "ce_loss": 0.22265367209911346, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "distill_loss": 0.37538236379623413, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "ref_ce_loss": 0.160588338971138, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "loss": 0.7223681807518005, + "step": 2830 + }, + { + "ce_loss": 0.18482926487922668, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "distill_loss": 0.3430609107017517, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "ref_ce_loss": 0.11669516563415527, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "loss": 0.8510680794715881, + "step": 2830 + }, + { + "ce_loss": 0.12880350649356842, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "distill_loss": 0.42931199073791504, + "epoch": 0.9439626417611742, + "step": 2830 + }, + { + "epoch": 0.9439626417611742, + "ref_ce_loss": 0.11357249319553375, + "step": 2830 + }, + { + "epoch": 0.9472981987991995, + "loss": 0.9909, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "grad_norm": 2.5945699214935303, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "learning_rate": 0.00029389485915767675, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "loss": 1.5304577350616455, + "step": 2840 + }, + { + "ce_loss": 0.37256667017936707, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "distill_loss": 0.2691047191619873, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "ref_ce_loss": 0.16921545565128326, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "loss": 0.7942785620689392, + "step": 2840 + }, + { + "ce_loss": 0.2432491034269333, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "distill_loss": 0.299993097782135, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "ref_ce_loss": 0.12675027549266815, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "loss": 0.7761873006820679, + "step": 2840 + }, + { + "ce_loss": 0.2942223846912384, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "distill_loss": 0.3370579481124878, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "ref_ce_loss": 0.14476382732391357, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "loss": 0.8187755942344666, + "step": 2840 + }, + { + "ce_loss": 0.16483481228351593, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "distill_loss": 0.3002135455608368, + "epoch": 0.9472981987991995, + "step": 2840 + }, + { + "epoch": 0.9472981987991995, + "ref_ce_loss": 0.1228487640619278, + "step": 2840 + }, + { + "epoch": 0.9506337558372249, + "loss": 0.781, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "grad_norm": 3.0309741497039795, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "learning_rate": 0.0002938375262204996, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "loss": 0.8299252390861511, + "step": 2850 + }, + { + "ce_loss": 0.18990515172481537, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "distill_loss": 0.16923873126506805, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "ref_ce_loss": 0.11406919360160828, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "loss": 0.6928610801696777, + "step": 2850 + }, + { + "ce_loss": 0.23902356624603271, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "distill_loss": 0.19913634657859802, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "ref_ce_loss": 0.18962852656841278, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "loss": 0.7069710493087769, + "step": 2850 + }, + { + "ce_loss": 0.24961289763450623, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "distill_loss": 0.15987449884414673, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "ref_ce_loss": 0.22801867127418518, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "loss": 1.0759639739990234, + "step": 2850 + }, + { + "ce_loss": 0.1746206283569336, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "distill_loss": 0.1477518379688263, + "epoch": 0.9506337558372249, + "step": 2850 + }, + { + "epoch": 0.9506337558372249, + "ref_ce_loss": 0.1326025128364563, + "step": 2850 + }, + { + "epoch": 0.9539693128752502, + "loss": 0.7967, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "grad_norm": 2.4280905723571777, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "learning_rate": 0.0002937799309804139, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "loss": 0.4592561423778534, + "step": 2860 + }, + { + "ce_loss": 0.1547430455684662, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "distill_loss": 0.1239887923002243, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "ref_ce_loss": 0.10273492336273193, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "loss": 0.560876190662384, + "step": 2860 + }, + { + "ce_loss": 0.20759513974189758, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "distill_loss": 0.1149979904294014, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "ref_ce_loss": 0.16428732872009277, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "loss": 0.687434732913971, + "step": 2860 + }, + { + "ce_loss": 0.25539320707321167, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "distill_loss": 0.15676721930503845, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "ref_ce_loss": 0.1433422714471817, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "loss": 0.668256402015686, + "step": 2860 + }, + { + "ce_loss": 0.2717263400554657, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "distill_loss": 0.14029839634895325, + "epoch": 0.9539693128752502, + "step": 2860 + }, + { + "epoch": 0.9539693128752502, + "ref_ce_loss": 0.14045491814613342, + "step": 2860 + }, + { + "epoch": 0.9573048699132756, + "loss": 0.724, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "grad_norm": 2.734346866607666, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "learning_rate": 0.0002937220735424506, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "loss": 0.6680867075920105, + "step": 2870 + }, + { + "ce_loss": 0.21752631664276123, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "distill_loss": 0.15151241421699524, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "ref_ce_loss": 0.11191045492887497, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "loss": 0.6690918803215027, + "step": 2870 + }, + { + "ce_loss": 0.16620102524757385, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "distill_loss": 0.16352228820323944, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "ref_ce_loss": 0.1283581703901291, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "loss": 0.8832834959030151, + "step": 2870 + }, + { + "ce_loss": 0.20673666894435883, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "distill_loss": 0.16823595762252808, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "ref_ce_loss": 0.19927184283733368, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "loss": 0.7416874766349792, + "step": 2870 + }, + { + "ce_loss": 0.22405901551246643, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "distill_loss": 0.15006643533706665, + "epoch": 0.9573048699132756, + "step": 2870 + }, + { + "epoch": 0.9573048699132756, + "ref_ce_loss": 0.21225836873054504, + "step": 2870 + }, + { + "epoch": 0.9606404269513009, + "loss": 0.738, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "grad_norm": 2.3339405059814453, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "learning_rate": 0.0002936639540121189, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "loss": 1.1273889541625977, + "step": 2880 + }, + { + "ce_loss": 0.1480511724948883, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "distill_loss": 0.0853370949625969, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "ref_ce_loss": 0.11499093472957611, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "loss": 0.6598379611968994, + "step": 2880 + }, + { + "ce_loss": 0.30389922857284546, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "distill_loss": 0.11932610720396042, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "ref_ce_loss": 0.16129109263420105, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "loss": 0.6228227019309998, + "step": 2880 + }, + { + "ce_loss": 0.26174575090408325, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "distill_loss": 0.1098339632153511, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "ref_ce_loss": 0.17401239275932312, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "loss": 0.491678386926651, + "step": 2880 + }, + { + "ce_loss": 0.208614781498909, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "distill_loss": 0.10472996532917023, + "epoch": 0.9606404269513009, + "step": 2880 + }, + { + "epoch": 0.9606404269513009, + "ref_ce_loss": 0.17823967337608337, + "step": 2880 + }, + { + "epoch": 0.9639759839893263, + "loss": 0.6991, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "grad_norm": 1.9527865648269653, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "learning_rate": 0.0002936055724954059, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "loss": 0.6537126302719116, + "step": 2890 + }, + { + "ce_loss": 0.32857227325439453, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "distill_loss": 0.14058195054531097, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "ref_ce_loss": 0.18424710631370544, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "loss": 0.621168851852417, + "step": 2890 + }, + { + "ce_loss": 0.24660468101501465, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "distill_loss": 0.14954647421836853, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "ref_ce_loss": 0.1393446922302246, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "loss": 0.6661877036094666, + "step": 2890 + }, + { + "ce_loss": 0.26231226325035095, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "distill_loss": 0.12446777522563934, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "ref_ce_loss": 0.21287235617637634, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "loss": 0.4061315953731537, + "step": 2890 + }, + { + "ce_loss": 0.1379050761461258, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "distill_loss": 0.10318372398614883, + "epoch": 0.9639759839893263, + "step": 2890 + }, + { + "epoch": 0.9639759839893263, + "ref_ce_loss": 0.13453784584999084, + "step": 2890 + }, + { + "epoch": 0.9673115410273516, + "loss": 0.6889, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "grad_norm": 1.8287708759307861, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "learning_rate": 0.0002935469290987765, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "loss": 0.6817537546157837, + "step": 2900 + }, + { + "ce_loss": 0.31336405873298645, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "distill_loss": 0.14172199368476868, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "ref_ce_loss": 0.2252521812915802, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "loss": 0.5109325647354126, + "step": 2900 + }, + { + "ce_loss": 0.1633598655462265, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "distill_loss": 0.12366708368062973, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "ref_ce_loss": 0.11546748876571655, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "loss": 0.6547499895095825, + "step": 2900 + }, + { + "ce_loss": 0.29048779606819153, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "distill_loss": 0.1456313133239746, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "ref_ce_loss": 0.21713127195835114, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "loss": 0.6343402862548828, + "step": 2900 + }, + { + "ce_loss": 0.22194309532642365, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "distill_loss": 0.14054368436336517, + "epoch": 0.9673115410273516, + "step": 2900 + }, + { + "epoch": 0.9673115410273516, + "ref_ce_loss": 0.1832578331232071, + "step": 2900 + }, + { + "epoch": 0.970647098065377, + "loss": 0.6749, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "grad_norm": 2.4779341220855713, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "learning_rate": 0.00029348802392917305, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "loss": 0.5308644771575928, + "step": 2910 + }, + { + "ce_loss": 0.2553784251213074, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "distill_loss": 0.12964846193790436, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "ref_ce_loss": 0.1454658955335617, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "loss": 0.7324144244194031, + "step": 2910 + }, + { + "ce_loss": 0.35473960638046265, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "distill_loss": 0.13062384724617004, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "ref_ce_loss": 0.16410627961158752, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "loss": 0.6389783620834351, + "step": 2910 + }, + { + "ce_loss": 0.2581387162208557, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "distill_loss": 0.11254259198904037, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "ref_ce_loss": 0.10073118656873703, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "loss": 0.8085727691650391, + "step": 2910 + }, + { + "ce_loss": 0.29789814352989197, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "distill_loss": 0.11773893237113953, + "epoch": 0.970647098065377, + "step": 2910 + }, + { + "epoch": 0.970647098065377, + "ref_ce_loss": 0.1585742086172104, + "step": 2910 + }, + { + "epoch": 0.9739826551034023, + "loss": 0.689, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "grad_norm": 2.5197560787200928, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "learning_rate": 0.0002934288570940153, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "loss": 0.7948296070098877, + "step": 2920 + }, + { + "ce_loss": 0.1193004623055458, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "distill_loss": 0.11356161534786224, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "ref_ce_loss": 0.10010401904582977, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "loss": 0.8971014022827148, + "step": 2920 + }, + { + "ce_loss": 0.4552440345287323, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "distill_loss": 0.16932310163974762, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "ref_ce_loss": 0.189690962433815, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "loss": 1.1074857711791992, + "step": 2920 + }, + { + "ce_loss": 0.2667383551597595, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "distill_loss": 0.10355770587921143, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "ref_ce_loss": 0.1737743765115738, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "loss": 0.5775800943374634, + "step": 2920 + }, + { + "ce_loss": 0.12394880503416061, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "distill_loss": 0.09791535139083862, + "epoch": 0.9739826551034023, + "step": 2920 + }, + { + "epoch": 0.9739826551034023, + "ref_ce_loss": 0.1486055999994278, + "step": 2920 + }, + { + "epoch": 0.9773182121414277, + "loss": 0.6744, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "grad_norm": 3.1562507152557373, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "learning_rate": 0.00029336942870120033, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "loss": 1.3249276876449585, + "step": 2930 + }, + { + "ce_loss": 0.3045770823955536, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "distill_loss": 0.12404598295688629, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "ref_ce_loss": 0.15918442606925964, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "loss": 0.7175264954566956, + "step": 2930 + }, + { + "ce_loss": 0.19825127720832825, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "distill_loss": 0.1021764725446701, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "ref_ce_loss": 0.16720989346504211, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "loss": 0.648270845413208, + "step": 2930 + }, + { + "ce_loss": 0.15907418727874756, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "distill_loss": 0.09872958809137344, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "ref_ce_loss": 0.13381551206111908, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "loss": 0.8462702631950378, + "step": 2930 + }, + { + "ce_loss": 0.4205211400985718, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "distill_loss": 0.1454874575138092, + "epoch": 0.9773182121414277, + "step": 2930 + }, + { + "epoch": 0.9773182121414277, + "ref_ce_loss": 0.1591566801071167, + "step": 2930 + }, + { + "epoch": 0.980653769179453, + "loss": 0.6945, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "grad_norm": 2.639544725418091, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "learning_rate": 0.000293309738859102, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "loss": 0.4652785658836365, + "step": 2940 + }, + { + "ce_loss": 0.17041869461536407, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "distill_loss": 0.12091349810361862, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "ref_ce_loss": 0.12496335804462433, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "loss": 0.6894442439079285, + "step": 2940 + }, + { + "ce_loss": 0.19380292296409607, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "distill_loss": 0.12530314922332764, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "ref_ce_loss": 0.14851728081703186, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "loss": 0.523069441318512, + "step": 2940 + }, + { + "ce_loss": 0.1773136556148529, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "distill_loss": 0.12015138566493988, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "ref_ce_loss": 0.11531037092208862, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "loss": 0.6160759329795837, + "step": 2940 + }, + { + "ce_loss": 0.20735104382038116, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "distill_loss": 0.13009101152420044, + "epoch": 0.980653769179453, + "step": 2940 + }, + { + "epoch": 0.980653769179453, + "ref_ce_loss": 0.13989074528217316, + "step": 2940 + }, + { + "epoch": 0.9839893262174784, + "loss": 1.0021, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "grad_norm": 2.5855627059936523, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "learning_rate": 0.0002932497876765711, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "loss": 0.606486976146698, + "step": 2950 + }, + { + "ce_loss": 0.22377228736877441, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "distill_loss": 0.2075396329164505, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "ref_ce_loss": 0.10233394056558609, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "loss": 0.5983362197875977, + "step": 2950 + }, + { + "ce_loss": 0.18260978162288666, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "distill_loss": 0.1780635416507721, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "ref_ce_loss": 0.08419310301542282, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "loss": 0.7534973621368408, + "step": 2950 + }, + { + "ce_loss": 0.21013881266117096, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "distill_loss": 0.24599647521972656, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "ref_ce_loss": 0.19525231420993805, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "loss": 1.276760458946228, + "step": 2950 + }, + { + "ce_loss": 0.34345290064811707, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "distill_loss": 0.2109031230211258, + "epoch": 0.9839893262174784, + "step": 2950 + }, + { + "epoch": 0.9839893262174784, + "ref_ce_loss": 0.21050138771533966, + "step": 2950 + }, + { + "epoch": 0.9873248832555037, + "loss": 0.7315, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "grad_norm": 2.1952788829803467, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "learning_rate": 0.0002931895752629349, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "loss": 0.5843423008918762, + "step": 2960 + }, + { + "ce_loss": 0.2293442189693451, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "distill_loss": 0.13252869248390198, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "ref_ce_loss": 0.1581684648990631, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "loss": 0.7649279236793518, + "step": 2960 + }, + { + "ce_loss": 0.35924723744392395, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "distill_loss": 0.14280739426612854, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "ref_ce_loss": 0.21604615449905396, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "loss": 0.6165916919708252, + "step": 2960 + }, + { + "ce_loss": 0.29689690470695496, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "distill_loss": 0.12747378647327423, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "ref_ce_loss": 0.14126361906528473, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "loss": 0.4910457134246826, + "step": 2960 + }, + { + "ce_loss": 0.20109711587429047, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "distill_loss": 0.13267558813095093, + "epoch": 0.9873248832555037, + "step": 2960 + }, + { + "epoch": 0.9873248832555037, + "ref_ce_loss": 0.08610528707504272, + "step": 2960 + }, + { + "epoch": 0.9906604402935291, + "loss": 0.6879, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "grad_norm": 5.9090895652771, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "learning_rate": 0.0002931291017279971, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "loss": 1.4796576499938965, + "step": 2970 + }, + { + "ce_loss": 0.25391554832458496, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "distill_loss": 0.12180805951356888, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "ref_ce_loss": 0.1687461882829666, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "loss": 0.6464487314224243, + "step": 2970 + }, + { + "ce_loss": 0.3380550146102905, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "distill_loss": 0.11552728712558746, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "ref_ce_loss": 0.19248206913471222, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "loss": 0.4546283781528473, + "step": 2970 + }, + { + "ce_loss": 0.1595580279827118, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "distill_loss": 0.0903565064072609, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "ref_ce_loss": 0.14584065973758698, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "loss": 0.5977323055267334, + "step": 2970 + }, + { + "ce_loss": 0.22186864912509918, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "distill_loss": 0.12966331839561462, + "epoch": 0.9906604402935291, + "step": 2970 + }, + { + "epoch": 0.9906604402935291, + "ref_ce_loss": 0.1510908603668213, + "step": 2970 + }, + { + "epoch": 0.9939959973315544, + "loss": 0.7359, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "grad_norm": 2.193732500076294, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "learning_rate": 0.00029306836718203755, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "loss": 0.9112476110458374, + "step": 2980 + }, + { + "ce_loss": 0.35679250955581665, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "distill_loss": 0.13653014600276947, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "ref_ce_loss": 0.18527543544769287, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "loss": 0.8198769688606262, + "step": 2980 + }, + { + "ce_loss": 0.37211382389068604, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "distill_loss": 0.12967464327812195, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "ref_ce_loss": 0.24657918512821198, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "loss": 0.6387631297111511, + "step": 2980 + }, + { + "ce_loss": 0.2982257902622223, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "distill_loss": 0.11798281222581863, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "ref_ce_loss": 0.13815459609031677, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "loss": 1.0164731740951538, + "step": 2980 + }, + { + "ce_loss": 0.17050199210643768, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "distill_loss": 0.1035347655415535, + "epoch": 0.9939959973315544, + "step": 2980 + }, + { + "epoch": 0.9939959973315544, + "ref_ce_loss": 0.1252170354127884, + "step": 2980 + }, + { + "epoch": 0.9973315543695798, + "loss": 0.7284, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "grad_norm": 7.909195423126221, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "learning_rate": 0.00029300737173581213, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "loss": 0.625489354133606, + "step": 2990 + }, + { + "ce_loss": 0.1594225913286209, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "distill_loss": 0.2516362965106964, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "ref_ce_loss": 0.17256700992584229, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "loss": 0.8924147486686707, + "step": 2990 + }, + { + "ce_loss": 0.2955637276172638, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "distill_loss": 0.3669006824493408, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "ref_ce_loss": 0.1760740727186203, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "loss": 1.2954351902008057, + "step": 2990 + }, + { + "ce_loss": 0.3024890720844269, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "distill_loss": 0.37931498885154724, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "ref_ce_loss": 0.16591976583003998, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "loss": 0.7697754502296448, + "step": 2990 + }, + { + "ce_loss": 0.24155797064304352, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "distill_loss": 0.3250412940979004, + "epoch": 0.9973315543695798, + "step": 2990 + }, + { + "epoch": 0.9973315543695798, + "ref_ce_loss": 0.12637540698051453, + "step": 2990 + }, + { + "epoch": 1.0006671114076051, + "loss": 0.9277, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "grad_norm": 2.1417627334594727, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "learning_rate": 0.0002929461155005525, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "loss": 0.9282156825065613, + "step": 3000 + }, + { + "ce_loss": 0.31524181365966797, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "distill_loss": 0.38363519310951233, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "ref_ce_loss": 0.16468551754951477, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "loss": 0.5853087902069092, + "step": 3000 + }, + { + "ce_loss": 0.1501854807138443, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "distill_loss": 0.2998085916042328, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "ref_ce_loss": 0.09147650003433228, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "loss": 0.6547336578369141, + "step": 3000 + }, + { + "ce_loss": 0.15813229978084564, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "distill_loss": 0.31622615456581116, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "ref_ce_loss": 0.08632109314203262, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "loss": 1.010650873184204, + "step": 3000 + }, + { + "ce_loss": 0.2512657642364502, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "distill_loss": 0.33857262134552, + "epoch": 1.0006671114076051, + "step": 3000 + }, + { + "epoch": 1.0006671114076051, + "ref_ce_loss": 0.13122253119945526, + "step": 3000 + }, + { + "epoch": 1.0040026684456305, + "loss": 0.7838, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "grad_norm": 4.269802093505859, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "learning_rate": 0.0002928845985879658, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "loss": 0.9392971992492676, + "step": 3010 + }, + { + "ce_loss": 0.2948865592479706, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "distill_loss": 0.2655756175518036, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "ref_ce_loss": 0.14558996260166168, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "loss": 0.7004539966583252, + "step": 3010 + }, + { + "ce_loss": 0.18756701052188873, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "distill_loss": 0.20628270506858826, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "ref_ce_loss": 0.12921027839183807, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "loss": 0.6955971717834473, + "step": 3010 + }, + { + "ce_loss": 0.24592086672782898, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "distill_loss": 0.21004171669483185, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "ref_ce_loss": 0.12992355227470398, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "loss": 0.8428363800048828, + "step": 3010 + }, + { + "ce_loss": 0.12708482146263123, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "distill_loss": 0.1839914321899414, + "epoch": 1.0040026684456305, + "step": 3010 + }, + { + "epoch": 1.0040026684456305, + "ref_ce_loss": 0.11820346862077713, + "step": 3010 + }, + { + "epoch": 1.0073382254836558, + "loss": 0.7617, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "grad_norm": 3.130213499069214, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "learning_rate": 0.00029282282111023464, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "loss": 0.7784039974212646, + "step": 3020 + }, + { + "ce_loss": 0.30632612109184265, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "distill_loss": 0.2653581500053406, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "ref_ce_loss": 0.20647765696048737, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "loss": 1.3872511386871338, + "step": 3020 + }, + { + "ce_loss": 0.5806623101234436, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "distill_loss": 0.27128076553344727, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "ref_ce_loss": 0.3675278425216675, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "loss": 0.5582769513130188, + "step": 3020 + }, + { + "ce_loss": 0.1799483597278595, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "distill_loss": 0.2470041662454605, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "ref_ce_loss": 0.13095030188560486, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "loss": 1.0463906526565552, + "step": 3020 + }, + { + "ce_loss": 0.2606077492237091, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "distill_loss": 0.26832517981529236, + "epoch": 1.0073382254836558, + "step": 3020 + }, + { + "epoch": 1.0073382254836558, + "ref_ce_loss": 0.12840500473976135, + "step": 3020 + }, + { + "epoch": 1.0106737825216812, + "loss": 0.8325, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "grad_norm": 4.799256801605225, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "learning_rate": 0.00029276078318001686, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "loss": 0.551561713218689, + "step": 3030 + }, + { + "ce_loss": 0.16658858954906464, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "distill_loss": 0.18139103055000305, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "ref_ce_loss": 0.11683313548564911, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "loss": 0.7683470845222473, + "step": 3030 + }, + { + "ce_loss": 0.29117369651794434, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "distill_loss": 0.19345280528068542, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "ref_ce_loss": 0.1394084095954895, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "loss": 0.5669423341751099, + "step": 3030 + }, + { + "ce_loss": 0.23442049324512482, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "distill_loss": 0.15671990811824799, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "ref_ce_loss": 0.10465249419212341, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "loss": 0.8321839570999146, + "step": 3030 + }, + { + "ce_loss": 0.20825815200805664, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "distill_loss": 0.16022738814353943, + "epoch": 1.0106737825216812, + "step": 3030 + }, + { + "epoch": 1.0106737825216812, + "ref_ce_loss": 0.13101598620414734, + "step": 3030 + }, + { + "epoch": 1.0140093395597065, + "loss": 0.6702, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "grad_norm": 5.037836074829102, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "learning_rate": 0.000292698484910445, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "loss": 0.5145962238311768, + "step": 3040 + }, + { + "ce_loss": 0.21563299000263214, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "distill_loss": 0.12810231745243073, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "ref_ce_loss": 0.12446434050798416, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "loss": 0.8015331029891968, + "step": 3040 + }, + { + "ce_loss": 0.29993003606796265, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "distill_loss": 0.15203720331192017, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "ref_ce_loss": 0.1160355657339096, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "loss": 0.5979145765304565, + "step": 3040 + }, + { + "ce_loss": 0.16789746284484863, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "distill_loss": 0.11229067295789719, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "ref_ce_loss": 0.08903674781322479, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "loss": 1.0836822986602783, + "step": 3040 + }, + { + "ce_loss": 0.22605106234550476, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "distill_loss": 0.15512679517269135, + "epoch": 1.0140093395597065, + "step": 3040 + }, + { + "epoch": 1.0140093395597065, + "ref_ce_loss": 0.11995731294155121, + "step": 3040 + }, + { + "epoch": 1.0173448965977319, + "loss": 0.7133, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "grad_norm": 1.9387662410736084, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "learning_rate": 0.0002926359264151267, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "loss": 0.5229524374008179, + "step": 3050 + }, + { + "ce_loss": 0.2520233392715454, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "distill_loss": 0.11001858115196228, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "ref_ce_loss": 0.16077430546283722, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "loss": 0.7243987917900085, + "step": 3050 + }, + { + "ce_loss": 0.27749699354171753, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "distill_loss": 0.14459560811519623, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "ref_ce_loss": 0.17567412555217743, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "loss": 0.40805763006210327, + "step": 3050 + }, + { + "ce_loss": 0.15319980680942535, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "distill_loss": 0.0772315114736557, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "ref_ce_loss": 0.11021654307842255, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "loss": 0.44821786880493164, + "step": 3050 + }, + { + "ce_loss": 0.2088594138622284, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "distill_loss": 0.11182739585638046, + "epoch": 1.0173448965977319, + "step": 3050 + }, + { + "epoch": 1.0173448965977319, + "ref_ce_loss": 0.1273898184299469, + "step": 3050 + }, + { + "epoch": 1.0206804536357572, + "loss": 0.7103, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "grad_norm": 3.4637560844421387, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "learning_rate": 0.00029257310780814383, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "loss": 0.6055834889411926, + "step": 3060 + }, + { + "ce_loss": 0.24962303042411804, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "distill_loss": 0.14352454245090485, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "ref_ce_loss": 0.1418440341949463, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "loss": 0.577202558517456, + "step": 3060 + }, + { + "ce_loss": 0.2429414987564087, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "distill_loss": 0.16753113269805908, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "ref_ce_loss": 0.10312572866678238, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "loss": 1.0788899660110474, + "step": 3060 + }, + { + "ce_loss": 0.1864338368177414, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "distill_loss": 0.15208856761455536, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "ref_ce_loss": 0.1461482048034668, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "loss": 0.6381796598434448, + "step": 3060 + }, + { + "ce_loss": 0.23099523782730103, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "distill_loss": 0.15411165356636047, + "epoch": 1.0206804536357572, + "step": 3060 + }, + { + "epoch": 1.0206804536357572, + "ref_ce_loss": 0.17182599008083344, + "step": 3060 + }, + { + "epoch": 1.0240160106737826, + "loss": 0.6631, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "grad_norm": 3.612990140914917, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "learning_rate": 0.00029251002920405286, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "loss": 0.4682758152484894, + "step": 3070 + }, + { + "ce_loss": 0.15293414890766144, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "distill_loss": 0.13002100586891174, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "ref_ce_loss": 0.10538853704929352, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "loss": 0.580215573310852, + "step": 3070 + }, + { + "ce_loss": 0.21724647283554077, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "distill_loss": 0.17224983870983124, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "ref_ce_loss": 0.11003424972295761, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "loss": 0.9422988295555115, + "step": 3070 + }, + { + "ce_loss": 0.20494846999645233, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "distill_loss": 0.149722620844841, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "ref_ce_loss": 0.1599912941455841, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "loss": 0.7094449996948242, + "step": 3070 + }, + { + "ce_loss": 0.20244437456130981, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "distill_loss": 0.11915453523397446, + "epoch": 1.0240160106737826, + "step": 3070 + }, + { + "epoch": 1.0240160106737826, + "ref_ce_loss": 0.09864697605371475, + "step": 3070 + }, + { + "epoch": 1.027351567711808, + "loss": 0.7187, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "grad_norm": 3.3055739402770996, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "learning_rate": 0.0002924466907178842, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "loss": 0.6497579216957092, + "step": 3080 + }, + { + "ce_loss": 0.14034055173397064, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "distill_loss": 0.1890394389629364, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "ref_ce_loss": 0.15432484447956085, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "loss": 0.9005087614059448, + "step": 3080 + }, + { + "ce_loss": 0.30830201506614685, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "distill_loss": 0.30596300959587097, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "ref_ce_loss": 0.1606854796409607, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "loss": 0.530880331993103, + "step": 3080 + }, + { + "ce_loss": 0.14225921034812927, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "distill_loss": 0.17800264060497284, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "ref_ce_loss": 0.08136451244354248, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "loss": 0.6551678776741028, + "step": 3080 + }, + { + "ce_loss": 0.19130204617977142, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "distill_loss": 0.2622304856777191, + "epoch": 1.027351567711808, + "step": 3080 + }, + { + "epoch": 1.027351567711808, + "ref_ce_loss": 0.16420768201351166, + "step": 3080 + }, + { + "epoch": 1.0306871247498333, + "loss": 0.724, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "grad_norm": 3.060091018676758, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "learning_rate": 0.0002923830924651424, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "loss": 0.6151540875434875, + "step": 3090 + }, + { + "ce_loss": 0.25476041436195374, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "distill_loss": 0.13451892137527466, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "ref_ce_loss": 0.18273842334747314, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "loss": 0.8939247727394104, + "step": 3090 + }, + { + "ce_loss": 0.32922524213790894, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "distill_loss": 0.14805200695991516, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "ref_ce_loss": 0.1756051629781723, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "loss": 0.8971487283706665, + "step": 3090 + }, + { + "ce_loss": 0.27347028255462646, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "distill_loss": 0.12306328862905502, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "ref_ce_loss": 0.23444783687591553, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "loss": 0.769548773765564, + "step": 3090 + }, + { + "ce_loss": 0.2549266815185547, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "distill_loss": 0.15688112378120422, + "epoch": 1.0306871247498333, + "step": 3090 + }, + { + "epoch": 1.0306871247498333, + "ref_ce_loss": 0.2493422031402588, + "step": 3090 + }, + { + "epoch": 1.0340226817878586, + "loss": 0.8049, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "grad_norm": 2.9161508083343506, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "learning_rate": 0.0002923192345618054, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "loss": 1.0467815399169922, + "step": 3100 + }, + { + "ce_loss": 0.33018168807029724, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "distill_loss": 0.22280463576316833, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "ref_ce_loss": 0.17719537019729614, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "loss": 0.6442136168479919, + "step": 3100 + }, + { + "ce_loss": 0.23328647017478943, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "distill_loss": 0.2879638075828552, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "ref_ce_loss": 0.12265841662883759, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "loss": 0.6296390891075134, + "step": 3100 + }, + { + "ce_loss": 0.1833505630493164, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "distill_loss": 0.30275848507881165, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "ref_ce_loss": 0.09682515263557434, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "loss": 0.6476273536682129, + "step": 3100 + }, + { + "ce_loss": 0.22235122323036194, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "distill_loss": 0.25921764969825745, + "epoch": 1.0340226817878586, + "step": 3100 + }, + { + "epoch": 1.0340226817878586, + "ref_ce_loss": 0.12296784669160843, + "step": 3100 + }, + { + "epoch": 1.037358238825884, + "loss": 0.7384, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "grad_norm": 4.091484546661377, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "learning_rate": 0.00029225511712432494, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "loss": 0.8834012746810913, + "step": 3110 + }, + { + "ce_loss": 0.28296226263046265, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "distill_loss": 0.2384410798549652, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "ref_ce_loss": 0.12611015141010284, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "loss": 0.5011523962020874, + "step": 3110 + }, + { + "ce_loss": 0.16393588483333588, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "distill_loss": 0.1573915183544159, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "ref_ce_loss": 0.09610199183225632, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "loss": 0.7043334245681763, + "step": 3110 + }, + { + "ce_loss": 0.2144857943058014, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "distill_loss": 0.1674727201461792, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "ref_ce_loss": 0.15289588272571564, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "loss": 0.5016894340515137, + "step": 3110 + }, + { + "ce_loss": 0.17593710124492645, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "distill_loss": 0.20370127260684967, + "epoch": 1.037358238825884, + "step": 3110 + }, + { + "epoch": 1.037358238825884, + "ref_ce_loss": 0.06827589869499207, + "step": 3110 + }, + { + "epoch": 1.0406937958639093, + "loss": 0.736, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "grad_norm": 2.3639414310455322, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "learning_rate": 0.0002921907402696259, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "loss": 0.8598054647445679, + "step": 3120 + }, + { + "ce_loss": 0.24055452644824982, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "distill_loss": 0.20108632743358612, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "ref_ce_loss": 0.13357633352279663, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "loss": 0.5927767753601074, + "step": 3120 + }, + { + "ce_loss": 0.16187889873981476, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "distill_loss": 0.21812832355499268, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "ref_ce_loss": 0.1286218911409378, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "loss": 0.6704609394073486, + "step": 3120 + }, + { + "ce_loss": 0.24853084981441498, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "distill_loss": 0.2065722942352295, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "ref_ce_loss": 0.14731864631175995, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "loss": 1.198374629020691, + "step": 3120 + }, + { + "ce_loss": 0.2626439034938812, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "distill_loss": 0.23086431622505188, + "epoch": 1.0406937958639093, + "step": 3120 + }, + { + "epoch": 1.0406937958639093, + "ref_ce_loss": 0.2478064000606537, + "step": 3120 + }, + { + "epoch": 1.0440293529019347, + "loss": 0.7584, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "grad_norm": 4.350533962249756, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "learning_rate": 0.00029212610411510627, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "loss": 0.5243332386016846, + "step": 3130 + }, + { + "ce_loss": 0.11996088922023773, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "distill_loss": 0.16550423204898834, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "ref_ce_loss": 0.07062079012393951, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "loss": 1.1567490100860596, + "step": 3130 + }, + { + "ce_loss": 0.1746305376291275, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "distill_loss": 0.2075626105070114, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "ref_ce_loss": 0.08206846565008163, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "loss": 0.6579044461250305, + "step": 3130 + }, + { + "ce_loss": 0.22023993730545044, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "distill_loss": 0.2082604169845581, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "ref_ce_loss": 0.11965802311897278, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "loss": 0.7400968074798584, + "step": 3130 + }, + { + "ce_loss": 0.26753515005111694, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "distill_loss": 0.2450755536556244, + "epoch": 1.0440293529019347, + "step": 3130 + }, + { + "epoch": 1.0440293529019347, + "ref_ce_loss": 0.1641095131635666, + "step": 3130 + }, + { + "epoch": 1.04736490993996, + "loss": 0.7619, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "grad_norm": 2.58843994140625, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "learning_rate": 0.000292061208778637, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "loss": 0.455960750579834, + "step": 3140 + }, + { + "ce_loss": 0.09826745092868805, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "distill_loss": 0.1610691100358963, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "ref_ce_loss": 0.057575829327106476, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "loss": 0.8150807619094849, + "step": 3140 + }, + { + "ce_loss": 0.26749011874198914, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "distill_loss": 0.205758199095726, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "ref_ce_loss": 0.17441095411777496, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "loss": 1.1181557178497314, + "step": 3140 + }, + { + "ce_loss": 0.39353179931640625, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "distill_loss": 0.24632498621940613, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "ref_ce_loss": 0.1923367828130722, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "loss": 0.9463489651679993, + "step": 3140 + }, + { + "ce_loss": 0.25722455978393555, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "distill_loss": 0.18198177218437195, + "epoch": 1.04736490993996, + "step": 3140 + }, + { + "epoch": 1.04736490993996, + "ref_ce_loss": 0.1510559469461441, + "step": 3140 + }, + { + "epoch": 1.0507004669779854, + "loss": 0.6906, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "grad_norm": 1.8357528448104858, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "learning_rate": 0.0002919960543785614, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "loss": 1.3078269958496094, + "step": 3150 + }, + { + "ce_loss": 0.3390888571739197, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "distill_loss": 0.16615071892738342, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "ref_ce_loss": 0.21896415948867798, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "loss": 0.8389450311660767, + "step": 3150 + }, + { + "ce_loss": 0.2977851331233978, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "distill_loss": 0.14731603860855103, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "ref_ce_loss": 0.15756700932979584, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "loss": 0.5830703973770142, + "step": 3150 + }, + { + "ce_loss": 0.26267823576927185, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "distill_loss": 0.15796950459480286, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "ref_ce_loss": 0.10677004605531693, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "loss": 0.7216756939888, + "step": 3150 + }, + { + "ce_loss": 0.299459844827652, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "distill_loss": 0.1527896672487259, + "epoch": 1.0507004669779854, + "step": 3150 + }, + { + "epoch": 1.0507004669779854, + "ref_ce_loss": 0.13475990295410156, + "step": 3150 + }, + { + "epoch": 1.0540360240160107, + "loss": 0.7631, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "grad_norm": 1.8830387592315674, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "learning_rate": 0.00029193064103369545, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "loss": 0.47316044569015503, + "step": 3160 + }, + { + "ce_loss": 0.13927239179611206, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "distill_loss": 0.1530410349369049, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "ref_ce_loss": 0.12716428935527802, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "loss": 0.6238396763801575, + "step": 3160 + }, + { + "ce_loss": 0.1877501904964447, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "distill_loss": 0.16471980512142181, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "ref_ce_loss": 0.13325060904026031, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "loss": 0.7353872656822205, + "step": 3160 + }, + { + "ce_loss": 0.23863466084003448, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "distill_loss": 0.17457962036132812, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "ref_ce_loss": 0.10836151987314224, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "loss": 0.7750897407531738, + "step": 3160 + }, + { + "ce_loss": 0.2611212730407715, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "distill_loss": 0.15589670836925507, + "epoch": 1.0540360240160107, + "step": 3160 + }, + { + "epoch": 1.0540360240160107, + "ref_ce_loss": 0.1313047707080841, + "step": 3160 + }, + { + "epoch": 1.057371581054036, + "loss": 0.7822, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "grad_norm": 3.618147611618042, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "learning_rate": 0.00029186496886332737, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "loss": 0.8868449926376343, + "step": 3170 + }, + { + "ce_loss": 0.14827127754688263, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "distill_loss": 0.18400876224040985, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "ref_ce_loss": 0.09630703181028366, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "loss": 1.0973975658416748, + "step": 3170 + }, + { + "ce_loss": 0.3045180141925812, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "distill_loss": 0.2753039598464966, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "ref_ce_loss": 0.17682716250419617, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "loss": 0.8529131412506104, + "step": 3170 + }, + { + "ce_loss": 0.19607514142990112, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "distill_loss": 0.29867586493492126, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "ref_ce_loss": 0.12237987667322159, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "loss": 1.0622225999832153, + "step": 3170 + }, + { + "ce_loss": 0.17475342750549316, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "distill_loss": 0.2065230756998062, + "epoch": 1.057371581054036, + "step": 3170 + }, + { + "epoch": 1.057371581054036, + "ref_ce_loss": 0.12568923830986023, + "step": 3170 + }, + { + "epoch": 1.0607071380920614, + "loss": 0.7855, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "grad_norm": 3.1847946643829346, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "learning_rate": 0.0002917990379872173, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "loss": 0.8633472323417664, + "step": 3180 + }, + { + "ce_loss": 0.2781065106391907, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "distill_loss": 0.26735353469848633, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "ref_ce_loss": 0.11382268369197845, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "loss": 0.6711214184761047, + "step": 3180 + }, + { + "ce_loss": 0.2781183421611786, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "distill_loss": 0.1842014491558075, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "ref_ce_loss": 0.1488085687160492, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "loss": 0.6630605459213257, + "step": 3180 + }, + { + "ce_loss": 0.1706310361623764, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "distill_loss": 0.24702239036560059, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "ref_ce_loss": 0.09450356662273407, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "loss": 0.5658689737319946, + "step": 3180 + }, + { + "ce_loss": 0.25082194805145264, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "distill_loss": 0.1784793734550476, + "epoch": 1.0607071380920614, + "step": 3180 + }, + { + "epoch": 1.0607071380920614, + "ref_ce_loss": 0.13623644411563873, + "step": 3180 + }, + { + "epoch": 1.0640426951300868, + "loss": 0.7828, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "grad_norm": 3.540067195892334, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "learning_rate": 0.000291732848525597, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "loss": 1.3457376956939697, + "step": 3190 + }, + { + "ce_loss": 0.28505170345306396, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "distill_loss": 0.24294057488441467, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "ref_ce_loss": 0.13038063049316406, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "loss": 0.4256614148616791, + "step": 3190 + }, + { + "ce_loss": 0.1152573972940445, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "distill_loss": 0.17031772434711456, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "ref_ce_loss": 0.08523818105459213, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "loss": 0.9928364157676697, + "step": 3190 + }, + { + "ce_loss": 0.2940632700920105, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "distill_loss": 0.26018041372299194, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "ref_ce_loss": 0.14488235116004944, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "loss": 0.7044861912727356, + "step": 3190 + }, + { + "ce_loss": 0.2863961458206177, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "distill_loss": 0.22382394969463348, + "epoch": 1.0640426951300868, + "step": 3190 + }, + { + "epoch": 1.0640426951300868, + "ref_ce_loss": 0.13676100969314575, + "step": 3190 + }, + { + "epoch": 1.067378252168112, + "loss": 0.7039, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "grad_norm": 2.542185068130493, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "learning_rate": 0.0002916664005991701, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "loss": 1.3946433067321777, + "step": 3200 + }, + { + "ce_loss": 0.2782725393772125, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "distill_loss": 0.15828801691532135, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "ref_ce_loss": 0.2695305645465851, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "loss": 0.7213587164878845, + "step": 3200 + }, + { + "ce_loss": 0.22855627536773682, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "distill_loss": 0.16613143682479858, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "ref_ce_loss": 0.20623084902763367, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "loss": 0.5456353425979614, + "step": 3200 + }, + { + "ce_loss": 0.1833302229642868, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "distill_loss": 0.16412489116191864, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "ref_ce_loss": 0.1315329372882843, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "loss": 0.6673558950424194, + "step": 3200 + }, + { + "ce_loss": 0.27535635232925415, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "distill_loss": 0.18773838877677917, + "epoch": 1.067378252168112, + "step": 3200 + }, + { + "epoch": 1.067378252168112, + "ref_ce_loss": 0.1323918253183365, + "step": 3200 + }, + { + "epoch": 1.0707138092061375, + "loss": 0.6588, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "grad_norm": 3.2879350185394287, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "learning_rate": 0.0002915996943291114, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "loss": 0.47054553031921387, + "step": 3210 + }, + { + "ce_loss": 0.14522524178028107, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "distill_loss": 0.12368728220462799, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "ref_ce_loss": 0.06966076791286469, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "loss": 1.2660126686096191, + "step": 3210 + }, + { + "ce_loss": 0.26258736848831177, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "distill_loss": 0.1678849756717682, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "ref_ce_loss": 0.15247942507266998, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "loss": 0.617216944694519, + "step": 3210 + }, + { + "ce_loss": 0.29853564500808716, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "distill_loss": 0.1275399774312973, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "ref_ce_loss": 0.1393018513917923, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "loss": 0.6128079295158386, + "step": 3210 + }, + { + "ce_loss": 0.24983979761600494, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "distill_loss": 0.16351068019866943, + "epoch": 1.0707138092061375, + "step": 3210 + }, + { + "epoch": 1.0707138092061375, + "ref_ce_loss": 0.12650151550769806, + "step": 3210 + }, + { + "epoch": 1.0740493662441628, + "loss": 0.7102, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "grad_norm": 2.6898279190063477, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "learning_rate": 0.00029153272983706665, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "loss": 0.6622181534767151, + "step": 3220 + }, + { + "ce_loss": 0.25215262174606323, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "distill_loss": 0.1243286281824112, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "ref_ce_loss": 0.16581885516643524, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "loss": 0.7243108749389648, + "step": 3220 + }, + { + "ce_loss": 0.19404850900173187, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "distill_loss": 0.1466391682624817, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "ref_ce_loss": 0.10018964856863022, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "loss": 0.6774519681930542, + "step": 3220 + }, + { + "ce_loss": 0.12806348502635956, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "distill_loss": 0.09503288567066193, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "ref_ce_loss": 0.11790133267641068, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "loss": 0.8044633865356445, + "step": 3220 + }, + { + "ce_loss": 0.29243576526641846, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "distill_loss": 0.18098977208137512, + "epoch": 1.0740493662441628, + "step": 3220 + }, + { + "epoch": 1.0740493662441628, + "ref_ce_loss": 0.1330576390028, + "step": 3220 + }, + { + "epoch": 1.0773849232821882, + "loss": 0.7111, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "grad_norm": 2.036210298538208, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "learning_rate": 0.0002914655072451528, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "loss": 0.5417668223381042, + "step": 3230 + }, + { + "ce_loss": 0.1813638061285019, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "distill_loss": 0.13782474398612976, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "ref_ce_loss": 0.13959860801696777, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "loss": 0.5895418524742126, + "step": 3230 + }, + { + "ce_loss": 0.18671901524066925, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "distill_loss": 0.14473098516464233, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "ref_ce_loss": 0.17107507586479187, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "loss": 0.6597346067428589, + "step": 3230 + }, + { + "ce_loss": 0.26041436195373535, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "distill_loss": 0.13516613841056824, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "ref_ce_loss": 0.10818801075220108, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "loss": 1.3986637592315674, + "step": 3230 + }, + { + "ce_loss": 0.49624744057655334, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "distill_loss": 0.19390641152858734, + "epoch": 1.0773849232821882, + "step": 3230 + }, + { + "epoch": 1.0773849232821882, + "ref_ce_loss": 0.14032429456710815, + "step": 3230 + }, + { + "epoch": 1.0807204803202135, + "loss": 0.6783, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "grad_norm": 2.5386595726013184, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "learning_rate": 0.00029139802667595735, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "loss": 0.7183874249458313, + "step": 3240 + }, + { + "ce_loss": 0.34214258193969727, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "distill_loss": 0.14284975826740265, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "ref_ce_loss": 0.1737593561410904, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "loss": 0.6244215369224548, + "step": 3240 + }, + { + "ce_loss": 0.21345029771327972, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "distill_loss": 0.13486254215240479, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "ref_ce_loss": 0.10968282073736191, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "loss": 0.6682906150817871, + "step": 3240 + }, + { + "ce_loss": 0.12535721063613892, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "distill_loss": 0.09490034729242325, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "ref_ce_loss": 0.08199267834424973, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "loss": 0.7619924545288086, + "step": 3240 + }, + { + "ce_loss": 0.3022087812423706, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "distill_loss": 0.12522542476654053, + "epoch": 1.0807204803202135, + "step": 3240 + }, + { + "epoch": 1.0807204803202135, + "ref_ce_loss": 0.200872540473938, + "step": 3240 + }, + { + "epoch": 1.0840560373582389, + "loss": 0.649, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "grad_norm": 1.8706231117248535, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "learning_rate": 0.00029133028825253823, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "loss": 0.6613234281539917, + "step": 3250 + }, + { + "ce_loss": 0.32606416940689087, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "distill_loss": 0.1341378539800644, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "ref_ce_loss": 0.20065777003765106, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "loss": 0.7039586305618286, + "step": 3250 + }, + { + "ce_loss": 0.27694404125213623, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "distill_loss": 0.13455909490585327, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "ref_ce_loss": 0.22442147135734558, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "loss": 1.2980964183807373, + "step": 3250 + }, + { + "ce_loss": 0.3101694583892822, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "distill_loss": 0.12680265307426453, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "ref_ce_loss": 0.18077422678470612, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "loss": 0.6655110120773315, + "step": 3250 + }, + { + "ce_loss": 0.3538602292537689, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "distill_loss": 0.12952083349227905, + "epoch": 1.0840560373582389, + "step": 3250 + }, + { + "epoch": 1.0840560373582389, + "ref_ce_loss": 0.18190212547779083, + "step": 3250 + }, + { + "epoch": 1.0873915943962642, + "loss": 0.6585, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "grad_norm": 1.9900285005569458, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "learning_rate": 0.00029126229209842355, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "loss": 0.6257374882698059, + "step": 3260 + }, + { + "ce_loss": 0.21010935306549072, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "distill_loss": 0.15154899656772614, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "ref_ce_loss": 0.11518467962741852, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "loss": 0.5982180833816528, + "step": 3260 + }, + { + "ce_loss": 0.1900973916053772, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "distill_loss": 0.1584339737892151, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "ref_ce_loss": 0.09822672605514526, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "loss": 0.4718421697616577, + "step": 3260 + }, + { + "ce_loss": 0.16259068250656128, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "distill_loss": 0.15456894040107727, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "ref_ce_loss": 0.1545715481042862, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "loss": 0.6045008897781372, + "step": 3260 + }, + { + "ce_loss": 0.2306605726480484, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "distill_loss": 0.13497021794319153, + "epoch": 1.0873915943962642, + "step": 3260 + }, + { + "epoch": 1.0873915943962642, + "ref_ce_loss": 0.10883224755525589, + "step": 3260 + }, + { + "epoch": 1.0907271514342896, + "loss": 0.6426, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "grad_norm": 2.159749746322632, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "learning_rate": 0.0002911940383376115, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "loss": 0.6150444746017456, + "step": 3270 + }, + { + "ce_loss": 0.25478172302246094, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "distill_loss": 0.20271824300289154, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "ref_ce_loss": 0.15745176374912262, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "loss": 0.7662996053695679, + "step": 3270 + }, + { + "ce_loss": 0.22442622482776642, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "distill_loss": 0.1579366773366928, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "ref_ce_loss": 0.1851867437362671, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "loss": 0.6150156855583191, + "step": 3270 + }, + { + "ce_loss": 0.15814071893692017, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "distill_loss": 0.15518149733543396, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "ref_ce_loss": 0.10483012348413467, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "loss": 0.7023760676383972, + "step": 3270 + }, + { + "ce_loss": 0.2654307782649994, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "distill_loss": 0.13676705956459045, + "epoch": 1.0907271514342896, + "step": 3270 + }, + { + "epoch": 1.0907271514342896, + "ref_ce_loss": 0.12032781541347504, + "step": 3270 + }, + { + "epoch": 1.094062708472315, + "loss": 0.6668, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "grad_norm": 2.4643666744232178, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "learning_rate": 0.00029112552709457013, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "loss": 0.5930442810058594, + "step": 3280 + }, + { + "ce_loss": 0.20769517123699188, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "distill_loss": 0.1775941401720047, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "ref_ce_loss": 0.1543862670660019, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "loss": 0.845909059047699, + "step": 3280 + }, + { + "ce_loss": 0.18115189671516418, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "distill_loss": 0.14669832587242126, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "ref_ce_loss": 0.1459181308746338, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "loss": 0.8020659685134888, + "step": 3280 + }, + { + "ce_loss": 0.22992858290672302, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "distill_loss": 0.18241055309772491, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "ref_ce_loss": 0.16772453486919403, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "loss": 0.637089729309082, + "step": 3280 + }, + { + "ce_loss": 0.16514697670936584, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "distill_loss": 0.16328474879264832, + "epoch": 1.094062708472315, + "step": 3280 + }, + { + "epoch": 1.094062708472315, + "ref_ce_loss": 0.09549298137426376, + "step": 3280 + }, + { + "epoch": 1.0973982655103403, + "loss": 0.7284, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "grad_norm": 2.1445436477661133, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "learning_rate": 0.0002910567584942367, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "loss": 0.47283828258514404, + "step": 3290 + }, + { + "ce_loss": 0.18235227465629578, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "distill_loss": 0.18419858813285828, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "ref_ce_loss": 0.10618919879198074, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "loss": 0.9133608937263489, + "step": 3290 + }, + { + "ce_loss": 0.2051980197429657, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "distill_loss": 0.16253188252449036, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "ref_ce_loss": 0.14432208240032196, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "loss": 0.4479078948497772, + "step": 3290 + }, + { + "ce_loss": 0.17836260795593262, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "distill_loss": 0.16010338068008423, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "ref_ce_loss": 0.10920123010873795, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "loss": 0.8367888331413269, + "step": 3290 + }, + { + "ce_loss": 0.3513221740722656, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "distill_loss": 0.21813614666461945, + "epoch": 1.0973982655103403, + "step": 3290 + }, + { + "epoch": 1.0973982655103403, + "ref_ce_loss": 0.2671056389808655, + "step": 3290 + }, + { + "epoch": 1.1007338225483656, + "loss": 0.6781, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "grad_norm": 2.1739988327026367, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "learning_rate": 0.00029098773266201817, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "loss": 0.6908473968505859, + "step": 3300 + }, + { + "ce_loss": 0.1812887191772461, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "distill_loss": 0.1621406078338623, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "ref_ce_loss": 0.12778893113136292, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "loss": 0.4360514283180237, + "step": 3300 + }, + { + "ce_loss": 0.15996110439300537, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "distill_loss": 0.14277467131614685, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "ref_ce_loss": 0.13183438777923584, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "loss": 0.6263667345046997, + "step": 3300 + }, + { + "ce_loss": 0.23108765482902527, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "distill_loss": 0.16686061024665833, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "ref_ce_loss": 0.10735461115837097, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "loss": 0.47359082102775574, + "step": 3300 + }, + { + "ce_loss": 0.14491242170333862, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "distill_loss": 0.16404592990875244, + "epoch": 1.1007338225483656, + "step": 3300 + }, + { + "epoch": 1.1007338225483656, + "ref_ce_loss": 0.11639886349439621, + "step": 3300 + }, + { + "epoch": 1.104069379586391, + "loss": 0.6936, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "grad_norm": 2.9527342319488525, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "learning_rate": 0.00029091844972379036, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "loss": 0.5075188279151917, + "step": 3310 + }, + { + "ce_loss": 0.18285036087036133, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "distill_loss": 0.11507537215948105, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "ref_ce_loss": 0.14507053792476654, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "loss": 0.6437574028968811, + "step": 3310 + }, + { + "ce_loss": 0.3010771572589874, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "distill_loss": 0.1596427708864212, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "ref_ce_loss": 0.1829010546207428, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "loss": 0.79887855052948, + "step": 3310 + }, + { + "ce_loss": 0.3642679750919342, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "distill_loss": 0.13354647159576416, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "ref_ce_loss": 0.1479581743478775, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "loss": 0.6291710734367371, + "step": 3310 + }, + { + "ce_loss": 0.2528185546398163, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "distill_loss": 0.1336766481399536, + "epoch": 1.104069379586391, + "step": 3310 + }, + { + "epoch": 1.104069379586391, + "ref_ce_loss": 0.19675657153129578, + "step": 3310 + }, + { + "epoch": 1.1074049366244163, + "loss": 0.675, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "grad_norm": 2.2137176990509033, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "learning_rate": 0.00029084890980589806, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "loss": 0.4275200664997101, + "step": 3320 + }, + { + "ce_loss": 0.1844518780708313, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "distill_loss": 0.09774865210056305, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "ref_ce_loss": 0.1450997292995453, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "loss": 0.6636391878128052, + "step": 3320 + }, + { + "ce_loss": 0.2548641562461853, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "distill_loss": 0.10373241454362869, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "ref_ce_loss": 0.18854564428329468, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "loss": 0.4774409830570221, + "step": 3320 + }, + { + "ce_loss": 0.21203358471393585, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "distill_loss": 0.12920010089874268, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "ref_ce_loss": 0.13605579733848572, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "loss": 0.35023680329322815, + "step": 3320 + }, + { + "ce_loss": 0.1279526650905609, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "distill_loss": 0.09370873868465424, + "epoch": 1.1074049366244163, + "step": 3320 + }, + { + "epoch": 1.1074049366244163, + "ref_ce_loss": 0.12837746739387512, + "step": 3320 + }, + { + "epoch": 1.1107404936624417, + "loss": 0.7074, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "grad_norm": 4.328708171844482, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "learning_rate": 0.0002907791130351547, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "loss": 1.5637755393981934, + "step": 3330 + }, + { + "ce_loss": 0.16754838824272156, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "distill_loss": 0.2678470015525818, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "ref_ce_loss": 0.12202798575162888, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "loss": 0.9462152719497681, + "step": 3330 + }, + { + "ce_loss": 0.18462923169136047, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "distill_loss": 0.27984893321990967, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "ref_ce_loss": 0.158742293715477, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "loss": 0.7361347079277039, + "step": 3330 + }, + { + "ce_loss": 0.20012395083904266, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "distill_loss": 0.23684218525886536, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "ref_ce_loss": 0.18918201327323914, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "loss": 0.7664051055908203, + "step": 3330 + }, + { + "ce_loss": 0.28589898347854614, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "distill_loss": 0.3149957060813904, + "epoch": 1.1107404936624417, + "step": 3330 + }, + { + "epoch": 1.1107404936624417, + "ref_ce_loss": 0.1653788983821869, + "step": 3330 + }, + { + "epoch": 1.114076050700467, + "loss": 0.766, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "grad_norm": 3.208272933959961, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "learning_rate": 0.0002907090595388419, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "loss": 0.55037921667099, + "step": 3340 + }, + { + "ce_loss": 0.19427846372127533, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "distill_loss": 0.19319778680801392, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "ref_ce_loss": 0.16233539581298828, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "loss": 1.021225929260254, + "step": 3340 + }, + { + "ce_loss": 0.23068369925022125, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "distill_loss": 0.2348114550113678, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "ref_ce_loss": 0.1501733660697937, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "loss": 0.6855276823043823, + "step": 3340 + }, + { + "ce_loss": 0.2387414425611496, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "distill_loss": 0.2071400135755539, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "ref_ce_loss": 0.17829790711402893, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "loss": 1.3309962749481201, + "step": 3340 + }, + { + "ce_loss": 0.26556396484375, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "distill_loss": 0.2766760289669037, + "epoch": 1.114076050700467, + "step": 3340 + }, + { + "epoch": 1.114076050700467, + "ref_ce_loss": 0.11858808249235153, + "step": 3340 + }, + { + "epoch": 1.1174116077384924, + "loss": 0.722, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "grad_norm": 2.8225748538970947, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "learning_rate": 0.00029063874944470976, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "loss": 0.6001484990119934, + "step": 3350 + }, + { + "ce_loss": 0.16739514470100403, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "distill_loss": 0.12116310000419617, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "ref_ce_loss": 0.15140965580940247, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "loss": 0.6058046221733093, + "step": 3350 + }, + { + "ce_loss": 0.24347113072872162, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "distill_loss": 0.14185252785682678, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "ref_ce_loss": 0.15476146340370178, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "loss": 0.5068092346191406, + "step": 3350 + }, + { + "ce_loss": 0.20585039258003235, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "distill_loss": 0.13100898265838623, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "ref_ce_loss": 0.11952287703752518, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "loss": 0.6194139122962952, + "step": 3350 + }, + { + "ce_loss": 0.24892067909240723, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "distill_loss": 0.1232369989156723, + "epoch": 1.1174116077384924, + "step": 3350 + }, + { + "epoch": 1.1174116077384924, + "ref_ce_loss": 0.10848834365606308, + "step": 3350 + }, + { + "epoch": 1.1207471647765177, + "loss": 0.7006, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "grad_norm": 3.294583797454834, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "learning_rate": 0.00029056818288097604, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "loss": 0.3698619306087494, + "step": 3360 + }, + { + "ce_loss": 0.13874569535255432, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "distill_loss": 0.10796372592449188, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "ref_ce_loss": 0.12302211672067642, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "loss": 0.7495459318161011, + "step": 3360 + }, + { + "ce_loss": 0.24759386479854584, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "distill_loss": 0.1263791173696518, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "ref_ce_loss": 0.14187084138393402, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "loss": 0.5314804911613464, + "step": 3360 + }, + { + "ce_loss": 0.1609436273574829, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "distill_loss": 0.12793494760990143, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "ref_ce_loss": 0.11565695703029633, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "loss": 0.6234934329986572, + "step": 3360 + }, + { + "ce_loss": 0.25368544459342957, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "distill_loss": 0.1242745965719223, + "epoch": 1.1207471647765177, + "step": 3360 + }, + { + "epoch": 1.1207471647765177, + "ref_ce_loss": 0.12844549119472504, + "step": 3360 + }, + { + "epoch": 1.124082721814543, + "loss": 0.6384, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "grad_norm": 2.149684190750122, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "learning_rate": 0.0002904973599763264, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "loss": 0.8979804515838623, + "step": 3370 + }, + { + "ce_loss": 0.3260359764099121, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "distill_loss": 0.12811465561389923, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "ref_ce_loss": 0.1349736899137497, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "loss": 0.7722697257995605, + "step": 3370 + }, + { + "ce_loss": 0.16676156222820282, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "distill_loss": 0.11455924808979034, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "ref_ce_loss": 0.14846940338611603, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "loss": 0.7938320636749268, + "step": 3370 + }, + { + "ce_loss": 0.19437064230442047, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "distill_loss": 0.10303813964128494, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "ref_ce_loss": 0.1043003499507904, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "loss": 0.591460108757019, + "step": 3370 + }, + { + "ce_loss": 0.2839815318584442, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "distill_loss": 0.12387672066688538, + "epoch": 1.124082721814543, + "step": 3370 + }, + { + "epoch": 1.124082721814543, + "ref_ce_loss": 0.13258396089076996, + "step": 3370 + }, + { + "epoch": 1.1274182788525684, + "loss": 0.7276, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "grad_norm": 3.415515899658203, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "learning_rate": 0.0002904262808599138, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "loss": 0.6177815794944763, + "step": 3380 + }, + { + "ce_loss": 0.1780986338853836, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "distill_loss": 0.09284936636686325, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "ref_ce_loss": 0.15803013741970062, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "loss": 0.5742705464363098, + "step": 3380 + }, + { + "ce_loss": 0.22747960686683655, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "distill_loss": 0.08297803997993469, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "ref_ce_loss": 0.17674367129802704, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "loss": 0.27375152707099915, + "step": 3380 + }, + { + "ce_loss": 0.09724527597427368, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "distill_loss": 0.06766033172607422, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "ref_ce_loss": 0.1087183803319931, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "loss": 0.6211159229278564, + "step": 3380 + }, + { + "ce_loss": 0.27215591073036194, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "distill_loss": 0.08365193009376526, + "epoch": 1.1274182788525684, + "step": 3380 + }, + { + "epoch": 1.1274182788525684, + "ref_ce_loss": 0.19366669654846191, + "step": 3380 + }, + { + "epoch": 1.1307538358905938, + "loss": 0.5849, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "grad_norm": 2.1435344219207764, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "learning_rate": 0.0002903549456613586, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "loss": 0.570225179195404, + "step": 3390 + }, + { + "ce_loss": 0.24740993976593018, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "distill_loss": 0.12046276032924652, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "ref_ce_loss": 0.2011656016111374, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "loss": 0.645056962966919, + "step": 3390 + }, + { + "ce_loss": 0.2715945839881897, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "distill_loss": 0.14380735158920288, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "ref_ce_loss": 0.1408105045557022, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "loss": 0.6090614795684814, + "step": 3390 + }, + { + "ce_loss": 0.2522716224193573, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "distill_loss": 0.13276003301143646, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "ref_ce_loss": 0.16461263597011566, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "loss": 0.644291877746582, + "step": 3390 + }, + { + "ce_loss": 0.2850654423236847, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "distill_loss": 0.1500633805990219, + "epoch": 1.1307538358905938, + "step": 3390 + }, + { + "epoch": 1.1307538358905938, + "ref_ce_loss": 0.1433248519897461, + "step": 3390 + }, + { + "epoch": 1.134089392928619, + "loss": 0.6568, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "grad_norm": 3.319453477859497, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "learning_rate": 0.000290283354510748, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "loss": 0.6292673945426941, + "step": 3400 + }, + { + "ce_loss": 0.2306334227323532, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "distill_loss": 0.11475005745887756, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "ref_ce_loss": 0.10790764540433884, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "loss": 0.3763951063156128, + "step": 3400 + }, + { + "ce_loss": 0.11027084290981293, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "distill_loss": 0.13586261868476868, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "ref_ce_loss": 0.10516718029975891, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "loss": 0.5288718938827515, + "step": 3400 + }, + { + "ce_loss": 0.20292605459690094, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "distill_loss": 0.13888004422187805, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "ref_ce_loss": 0.10799039900302887, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "loss": 0.7232894897460938, + "step": 3400 + }, + { + "ce_loss": 0.24011795222759247, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "distill_loss": 0.13609902560710907, + "epoch": 1.134089392928619, + "step": 3400 + }, + { + "epoch": 1.134089392928619, + "ref_ce_loss": 0.1771697998046875, + "step": 3400 + }, + { + "epoch": 1.1374249499666444, + "loss": 0.689, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "grad_norm": 2.43742036819458, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "learning_rate": 0.00029021150753863614, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "loss": 0.47013843059539795, + "step": 3410 + }, + { + "ce_loss": 0.25350990891456604, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "distill_loss": 0.13056300580501556, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "ref_ce_loss": 0.08574430644512177, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "loss": 0.8182771801948547, + "step": 3410 + }, + { + "ce_loss": 0.2465936690568924, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "distill_loss": 0.1421625316143036, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "ref_ce_loss": 0.19648271799087524, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "loss": 0.429518461227417, + "step": 3410 + }, + { + "ce_loss": 0.16575780510902405, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "distill_loss": 0.11913955211639404, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "ref_ce_loss": 0.08452162146568298, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "loss": 0.5666241645812988, + "step": 3410 + }, + { + "ce_loss": 0.1754610687494278, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "distill_loss": 0.1132507249712944, + "epoch": 1.1374249499666444, + "step": 3410 + }, + { + "epoch": 1.1374249499666444, + "ref_ce_loss": 0.18586274981498718, + "step": 3410 + }, + { + "epoch": 1.1407605070046698, + "loss": 0.6657, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "grad_norm": 2.6489334106445312, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "learning_rate": 0.00029013940487604336, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "loss": 0.7012091875076294, + "step": 3420 + }, + { + "ce_loss": 0.319513738155365, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "distill_loss": 0.23633024096488953, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "ref_ce_loss": 0.14517493546009064, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "loss": 1.1926076412200928, + "step": 3420 + }, + { + "ce_loss": 0.28267160058021545, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "distill_loss": 0.182927668094635, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "ref_ce_loss": 0.11487416177988052, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "loss": 0.642974317073822, + "step": 3420 + }, + { + "ce_loss": 0.19311952590942383, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "distill_loss": 0.14313045144081116, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "ref_ce_loss": 0.16836071014404297, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "loss": 0.6440466046333313, + "step": 3420 + }, + { + "ce_loss": 0.2331516444683075, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "distill_loss": 0.1541604846715927, + "epoch": 1.1407605070046698, + "step": 3420 + }, + { + "epoch": 1.1407605070046698, + "ref_ce_loss": 0.16985993087291718, + "step": 3420 + }, + { + "epoch": 1.1440960640426951, + "loss": 0.8199, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "grad_norm": 2.4301552772521973, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "learning_rate": 0.00029006704665445653, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "loss": 0.7986155152320862, + "step": 3430 + }, + { + "ce_loss": 0.16914814710617065, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "distill_loss": 0.2899048626422882, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "ref_ce_loss": 0.14340253174304962, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "loss": 0.8709583282470703, + "step": 3430 + }, + { + "ce_loss": 0.23835349082946777, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "distill_loss": 0.3554132282733917, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "ref_ce_loss": 0.13039939105510712, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "loss": 0.6296640634536743, + "step": 3430 + }, + { + "ce_loss": 0.1245369091629982, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "distill_loss": 0.27871596813201904, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "ref_ce_loss": 0.11910228431224823, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "loss": 0.9232403039932251, + "step": 3430 + }, + { + "ce_loss": 0.13957470655441284, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "distill_loss": 0.36236271262168884, + "epoch": 1.1440960640426951, + "step": 3430 + }, + { + "epoch": 1.1440960640426951, + "ref_ce_loss": 0.12281995266675949, + "step": 3430 + }, + { + "epoch": 1.1474316210807205, + "loss": 0.7824, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "grad_norm": 2.6593689918518066, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "learning_rate": 0.00028999443300582864, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "loss": 0.6766294836997986, + "step": 3440 + }, + { + "ce_loss": 0.2637663185596466, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "distill_loss": 0.21465124189853668, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "ref_ce_loss": 0.14603792130947113, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "loss": 1.1934056282043457, + "step": 3440 + }, + { + "ce_loss": 0.3012538552284241, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "distill_loss": 0.19502484798431396, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "ref_ce_loss": 0.1340654343366623, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "loss": 0.9661704301834106, + "step": 3440 + }, + { + "ce_loss": 0.2947535812854767, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "distill_loss": 0.17559944093227386, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "ref_ce_loss": 0.18281979858875275, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "loss": 0.6069207191467285, + "step": 3440 + }, + { + "ce_loss": 0.2645988166332245, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "distill_loss": 0.1422264575958252, + "epoch": 1.1474316210807205, + "step": 3440 + }, + { + "epoch": 1.1474316210807205, + "ref_ce_loss": 0.200031116604805, + "step": 3440 + }, + { + "epoch": 1.1507671781187458, + "loss": 0.6981, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "grad_norm": 6.953259468078613, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "learning_rate": 0.0002899215640625782, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "loss": 0.36105650663375854, + "step": 3450 + }, + { + "ce_loss": 0.11932425200939178, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "distill_loss": 0.10572926700115204, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "ref_ce_loss": 0.09935622662305832, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "loss": 0.6997066736221313, + "step": 3450 + }, + { + "ce_loss": 0.22075653076171875, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "distill_loss": 0.15865163505077362, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "ref_ce_loss": 0.12915155291557312, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "loss": 0.6172825694084167, + "step": 3450 + }, + { + "ce_loss": 0.31567999720573425, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "distill_loss": 0.1347445249557495, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "ref_ce_loss": 0.16673196852207184, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "loss": 0.6522892117500305, + "step": 3450 + }, + { + "ce_loss": 0.3099440634250641, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "distill_loss": 0.12644734978675842, + "epoch": 1.1507671781187458, + "step": 3450 + }, + { + "epoch": 1.1507671781187458, + "ref_ce_loss": 0.13789407908916473, + "step": 3450 + }, + { + "epoch": 1.1541027351567712, + "loss": 0.6446, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "grad_norm": 2.391942262649536, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "learning_rate": 0.00028984843995758945, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "loss": 0.6947566270828247, + "step": 3460 + }, + { + "ce_loss": 0.23904870450496674, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "distill_loss": 0.1353689730167389, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "ref_ce_loss": 0.15380215644836426, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "loss": 0.42585551738739014, + "step": 3460 + }, + { + "ce_loss": 0.15706568956375122, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "distill_loss": 0.1261441856622696, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "ref_ce_loss": 0.14227433502674103, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "loss": 1.389416217803955, + "step": 3460 + }, + { + "ce_loss": 0.25617700815200806, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "distill_loss": 0.14031198620796204, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "ref_ce_loss": 0.15955011546611786, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "loss": 0.38896113634109497, + "step": 3460 + }, + { + "ce_loss": 0.17260347306728363, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "distill_loss": 0.11305748671293259, + "epoch": 1.1541027351567712, + "step": 3460 + }, + { + "epoch": 1.1541027351567712, + "ref_ce_loss": 0.10322257876396179, + "step": 3460 + }, + { + "epoch": 1.1574382921947965, + "loss": 0.616, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "grad_norm": 2.23102068901062, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "learning_rate": 0.0002897750608242119, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "loss": 0.5251259207725525, + "step": 3470 + }, + { + "ce_loss": 0.21399055421352386, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "distill_loss": 0.12265275418758392, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "ref_ce_loss": 0.1231919601559639, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "loss": 1.4573123455047607, + "step": 3470 + }, + { + "ce_loss": 0.46214139461517334, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "distill_loss": 0.15091703832149506, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "ref_ce_loss": 0.18401379883289337, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "loss": 0.4763915538787842, + "step": 3470 + }, + { + "ce_loss": 0.18862882256507874, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "distill_loss": 0.08673227578401566, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "ref_ce_loss": 0.10439599305391312, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "loss": 1.18876314163208, + "step": 3470 + }, + { + "ce_loss": 0.3029150664806366, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "distill_loss": 0.11280804872512817, + "epoch": 1.1574382921947965, + "step": 3470 + }, + { + "epoch": 1.1574382921947965, + "ref_ce_loss": 0.16956168413162231, + "step": 3470 + }, + { + "epoch": 1.160773849232822, + "loss": 0.678, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "grad_norm": 2.679671287536621, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "learning_rate": 0.00028970142679626024, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "loss": 0.4567927122116089, + "step": 3480 + }, + { + "ce_loss": 0.14957164227962494, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "distill_loss": 0.15318655967712402, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "ref_ce_loss": 0.10267200320959091, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "loss": 0.5889483094215393, + "step": 3480 + }, + { + "ce_loss": 0.21724718809127808, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "distill_loss": 0.1829308122396469, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "ref_ce_loss": 0.0946304053068161, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "loss": 0.9133307933807373, + "step": 3480 + }, + { + "ce_loss": 0.40971285104751587, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "distill_loss": 0.2387247383594513, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "ref_ce_loss": 0.21331532299518585, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "loss": 0.8961673974990845, + "step": 3480 + }, + { + "ce_loss": 0.2807924449443817, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "distill_loss": 0.22617176175117493, + "epoch": 1.160773849232822, + "step": 3480 + }, + { + "epoch": 1.160773849232822, + "ref_ce_loss": 0.16985855996608734, + "step": 3480 + }, + { + "epoch": 1.1641094062708472, + "loss": 0.63, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "grad_norm": 2.3964879512786865, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "learning_rate": 0.00028962753800801383, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "loss": 1.1051898002624512, + "step": 3490 + }, + { + "ce_loss": 0.18541871011257172, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "distill_loss": 0.09360478073358536, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "ref_ce_loss": 0.08527404069900513, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "loss": 0.6198468804359436, + "step": 3490 + }, + { + "ce_loss": 0.24808526039123535, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "distill_loss": 0.10804644227027893, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "ref_ce_loss": 0.1124085932970047, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "loss": 0.928071141242981, + "step": 3490 + }, + { + "ce_loss": 0.26359590888023376, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "distill_loss": 0.13496239483356476, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "ref_ce_loss": 0.12686119973659515, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "loss": 0.7552123665809631, + "step": 3490 + }, + { + "ce_loss": 0.23390626907348633, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "distill_loss": 0.09438870847225189, + "epoch": 1.1641094062708472, + "step": 3490 + }, + { + "epoch": 1.1641094062708472, + "ref_ce_loss": 0.14667364954948425, + "step": 3490 + }, + { + "epoch": 1.1674449633088726, + "loss": 0.6156, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "grad_norm": 2.5168838500976562, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "learning_rate": 0.0002895533945942166, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "loss": 0.45355695486068726, + "step": 3500 + }, + { + "ce_loss": 0.18383370339870453, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "distill_loss": 0.09112045913934708, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "ref_ce_loss": 0.1282046139240265, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "loss": 0.5794343948364258, + "step": 3500 + }, + { + "ce_loss": 0.26721489429473877, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "distill_loss": 0.11259608715772629, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "ref_ce_loss": 0.19929681718349457, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "loss": 1.3107562065124512, + "step": 3500 + }, + { + "ce_loss": 0.22610430419445038, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "distill_loss": 0.09464081376791, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "ref_ce_loss": 0.1893109828233719, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "loss": 0.7660556435585022, + "step": 3500 + }, + { + "ce_loss": 0.2392236292362213, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "distill_loss": 0.09696606546640396, + "epoch": 1.1674449633088726, + "step": 3500 + }, + { + "epoch": 1.1674449633088726, + "ref_ce_loss": 0.17173665761947632, + "step": 3500 + }, + { + "epoch": 1.170780520346898, + "loss": 0.641, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "grad_norm": 2.7460596561431885, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "learning_rate": 0.000289478996690077, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "loss": 0.7444456815719604, + "step": 3510 + }, + { + "ce_loss": 0.2830856442451477, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "distill_loss": 0.10403694212436676, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "ref_ce_loss": 0.12975654006004333, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "loss": 0.65533047914505, + "step": 3510 + }, + { + "ce_loss": 0.21140718460083008, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "distill_loss": 0.09631803631782532, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "ref_ce_loss": 0.13436973094940186, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "loss": 1.00142240524292, + "step": 3510 + }, + { + "ce_loss": 0.24279353022575378, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "distill_loss": 0.10590619593858719, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "ref_ce_loss": 0.13143108785152435, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "loss": 0.5753421783447266, + "step": 3510 + }, + { + "ce_loss": 0.21339942514896393, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "distill_loss": 0.0863531157374382, + "epoch": 1.170780520346898, + "step": 3510 + }, + { + "epoch": 1.170780520346898, + "ref_ce_loss": 0.17893031239509583, + "step": 3510 + }, + { + "epoch": 1.1741160773849233, + "loss": 0.619, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "grad_norm": 4.942493438720703, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "learning_rate": 0.00028940434443126736, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "loss": 0.44487467408180237, + "step": 3520 + }, + { + "ce_loss": 0.19608616828918457, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "distill_loss": 0.10286536067724228, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "ref_ce_loss": 0.0929742306470871, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "loss": 1.1735808849334717, + "step": 3520 + }, + { + "ce_loss": 0.1759229153394699, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "distill_loss": 0.11952294409275055, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "ref_ce_loss": 0.20624127984046936, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "loss": 0.6996158957481384, + "step": 3520 + }, + { + "ce_loss": 0.1959640234708786, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "distill_loss": 0.0844564139842987, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "ref_ce_loss": 0.08976216614246368, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "loss": 0.4972955286502838, + "step": 3520 + }, + { + "ce_loss": 0.22896063327789307, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "distill_loss": 0.12463532388210297, + "epoch": 1.1741160773849233, + "step": 3520 + }, + { + "epoch": 1.1741160773849233, + "ref_ce_loss": 0.1436271220445633, + "step": 3520 + }, + { + "epoch": 1.1774516344229486, + "loss": 0.7453, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "grad_norm": 3.2423369884490967, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "learning_rate": 0.00028932943795392406, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "loss": 0.9250675439834595, + "step": 3530 + }, + { + "ce_loss": 0.2386307269334793, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "distill_loss": 0.24092139303684235, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "ref_ce_loss": 0.1675252616405487, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "loss": 0.5805863738059998, + "step": 3530 + }, + { + "ce_loss": 0.231970876455307, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "distill_loss": 0.23430107533931732, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "ref_ce_loss": 0.11357176303863525, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "loss": 0.8791962265968323, + "step": 3530 + }, + { + "ce_loss": 0.26668962836265564, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "distill_loss": 0.26670536398887634, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "ref_ce_loss": 0.15831813216209412, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "loss": 0.5398500561714172, + "step": 3530 + }, + { + "ce_loss": 0.20641468465328217, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "distill_loss": 0.1799340546131134, + "epoch": 1.1774516344229486, + "step": 3530 + }, + { + "epoch": 1.1774516344229486, + "ref_ce_loss": 0.1532498002052307, + "step": 3530 + }, + { + "epoch": 1.180787191460974, + "loss": 0.717, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "grad_norm": 2.5336835384368896, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "learning_rate": 0.0002892542773946468, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "loss": 0.8210991621017456, + "step": 3540 + }, + { + "ce_loss": 0.19714991748332977, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "distill_loss": 0.20616674423217773, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "ref_ce_loss": 0.13417214155197144, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "loss": 1.0714623928070068, + "step": 3540 + }, + { + "ce_loss": 0.37166714668273926, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "distill_loss": 0.2562217712402344, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "ref_ce_loss": 0.17911586165428162, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "loss": 0.5534677505493164, + "step": 3540 + }, + { + "ce_loss": 0.19591966271400452, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "distill_loss": 0.17593654990196228, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "ref_ce_loss": 0.18121853470802307, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "loss": 0.8016934394836426, + "step": 3540 + }, + { + "ce_loss": 0.3819798231124878, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "distill_loss": 0.22713986039161682, + "epoch": 1.180787191460974, + "step": 3540 + }, + { + "epoch": 1.180787191460974, + "ref_ce_loss": 0.19177654385566711, + "step": 3540 + }, + { + "epoch": 1.1841227484989993, + "loss": 0.7635, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "grad_norm": 5.0213165283203125, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "learning_rate": 0.00028917886289049903, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "loss": 0.651298463344574, + "step": 3550 + }, + { + "ce_loss": 0.19940482079982758, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "distill_loss": 0.23429922759532928, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "ref_ce_loss": 0.13923677802085876, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "loss": 0.7713093161582947, + "step": 3550 + }, + { + "ce_loss": 0.30344629287719727, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "distill_loss": 0.22498206794261932, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "ref_ce_loss": 0.18815527856349945, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "loss": 0.6081314086914062, + "step": 3550 + }, + { + "ce_loss": 0.2188054770231247, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "distill_loss": 0.18806442618370056, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "ref_ce_loss": 0.134329155087471, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "loss": 0.7301296591758728, + "step": 3550 + }, + { + "ce_loss": 0.284285306930542, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "distill_loss": 0.27022460103034973, + "epoch": 1.1841227484989993, + "step": 3550 + }, + { + "epoch": 1.1841227484989993, + "ref_ce_loss": 0.17183545231819153, + "step": 3550 + }, + { + "epoch": 1.1874583055370247, + "loss": 0.779, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "grad_norm": 2.13301682472229, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "learning_rate": 0.00028910319457900685, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "loss": 0.5808578133583069, + "step": 3560 + }, + { + "ce_loss": 0.1893155872821808, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "distill_loss": 0.14360655844211578, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "ref_ce_loss": 0.0961495190858841, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "loss": 0.42147135734558105, + "step": 3560 + }, + { + "ce_loss": 0.17312490940093994, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "distill_loss": 0.133303701877594, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "ref_ce_loss": 0.114800825715065, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "loss": 0.6292843818664551, + "step": 3560 + }, + { + "ce_loss": 0.2685680687427521, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "distill_loss": 0.17225492000579834, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "ref_ce_loss": 0.12113097310066223, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "loss": 1.1248754262924194, + "step": 3560 + }, + { + "ce_loss": 0.24754074215888977, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "distill_loss": 0.16874170303344727, + "epoch": 1.1874583055370247, + "step": 3560 + }, + { + "epoch": 1.1874583055370247, + "ref_ce_loss": 0.23063063621520996, + "step": 3560 + }, + { + "epoch": 1.19079386257505, + "loss": 0.6905, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "grad_norm": 5.3514580726623535, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "learning_rate": 0.00028902727259815956, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "loss": 0.6627309322357178, + "step": 3570 + }, + { + "ce_loss": 0.31900355219841003, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "distill_loss": 0.12173722684383392, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "ref_ce_loss": 0.1653222143650055, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "loss": 0.687881350517273, + "step": 3570 + }, + { + "ce_loss": 0.29725533723831177, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "distill_loss": 0.14126135408878326, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "ref_ce_loss": 0.19328750669956207, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "loss": 0.44334492087364197, + "step": 3570 + }, + { + "ce_loss": 0.1040029302239418, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "distill_loss": 0.09804385900497437, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "ref_ce_loss": 0.09700576961040497, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "loss": 0.797677755355835, + "step": 3570 + }, + { + "ce_loss": 0.16014978289604187, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "distill_loss": 0.11923857778310776, + "epoch": 1.19079386257505, + "step": 3570 + }, + { + "epoch": 1.19079386257505, + "ref_ce_loss": 0.08255496621131897, + "step": 3570 + }, + { + "epoch": 1.1941294196130754, + "loss": 0.6399, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "grad_norm": 2.4868955612182617, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "learning_rate": 0.00028895109708640876, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "loss": 0.7460382580757141, + "step": 3580 + }, + { + "ce_loss": 0.16989870369434357, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "distill_loss": 0.09856413304805756, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "ref_ce_loss": 0.09345860034227371, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "loss": 0.9039356112480164, + "step": 3580 + }, + { + "ce_loss": 0.35457083582878113, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "distill_loss": 0.15812674164772034, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "ref_ce_loss": 0.18375664949417114, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "loss": 0.6958930492401123, + "step": 3580 + }, + { + "ce_loss": 0.24443304538726807, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "distill_loss": 0.11708992719650269, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "ref_ce_loss": 0.14090929925441742, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "loss": 0.7182627320289612, + "step": 3580 + }, + { + "ce_loss": 0.20644690096378326, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "distill_loss": 0.12384609133005142, + "epoch": 1.1941294196130754, + "step": 3580 + }, + { + "epoch": 1.1941294196130754, + "ref_ce_loss": 0.1659153550863266, + "step": 3580 + }, + { + "epoch": 1.1974649766511007, + "loss": 0.6628, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "grad_norm": 29.785736083984375, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "learning_rate": 0.00028887466818266865, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "loss": 0.4725848436355591, + "step": 3590 + }, + { + "ce_loss": 0.25671958923339844, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "distill_loss": 0.08832456171512604, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "ref_ce_loss": 0.1264975517988205, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "loss": 1.0542960166931152, + "step": 3590 + }, + { + "ce_loss": 0.29440543055534363, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "distill_loss": 0.11251768469810486, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "ref_ce_loss": 0.20507730543613434, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "loss": 0.5612708330154419, + "step": 3590 + }, + { + "ce_loss": 0.13027560710906982, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "distill_loss": 0.08713222295045853, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "ref_ce_loss": 0.09830352663993835, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "loss": 0.5265268683433533, + "step": 3590 + }, + { + "ce_loss": 0.22897528111934662, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "distill_loss": 0.10633121430873871, + "epoch": 1.1974649766511007, + "step": 3590 + }, + { + "epoch": 1.1974649766511007, + "ref_ce_loss": 0.12552028894424438, + "step": 3590 + }, + { + "epoch": 1.200800533689126, + "loss": 0.6444, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "grad_norm": 2.3796160221099854, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "learning_rate": 0.00028879798602631537, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "loss": 0.6752573847770691, + "step": 3600 + }, + { + "ce_loss": 0.22611364722251892, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "distill_loss": 0.11424873024225235, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "ref_ce_loss": 0.15618790686130524, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "loss": 0.7464491128921509, + "step": 3600 + }, + { + "ce_loss": 0.21239803731441498, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "distill_loss": 0.10173000395298004, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "ref_ce_loss": 0.11320611089468002, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "loss": 0.739050030708313, + "step": 3600 + }, + { + "ce_loss": 0.23679211735725403, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "distill_loss": 0.11317134648561478, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "ref_ce_loss": 0.14366133511066437, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "loss": 0.4201709032058716, + "step": 3600 + }, + { + "ce_loss": 0.1703682392835617, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "distill_loss": 0.12370355427265167, + "epoch": 1.200800533689126, + "step": 3600 + }, + { + "epoch": 1.200800533689126, + "ref_ce_loss": 0.1257946789264679, + "step": 3600 + }, + { + "epoch": 1.2041360907271514, + "loss": 0.6474, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "grad_norm": 13.404465675354004, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "learning_rate": 0.0002887210507571869, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "loss": 0.49275848269462585, + "step": 3610 + }, + { + "ce_loss": 0.24199196696281433, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "distill_loss": 0.09797633439302444, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "ref_ce_loss": 0.1525280922651291, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "loss": 0.5640525221824646, + "step": 3610 + }, + { + "ce_loss": 0.19955472648143768, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "distill_loss": 0.11258351802825928, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "ref_ce_loss": 0.14966405928134918, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "loss": 0.48773708939552307, + "step": 3610 + }, + { + "ce_loss": 0.2079242616891861, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "distill_loss": 0.12160438299179077, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "ref_ce_loss": 0.15779396891593933, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "loss": 0.6037769317626953, + "step": 3610 + }, + { + "ce_loss": 0.28633320331573486, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "distill_loss": 0.12164897471666336, + "epoch": 1.2041360907271514, + "step": 3610 + }, + { + "epoch": 1.2041360907271514, + "ref_ce_loss": 0.1462378203868866, + "step": 3610 + }, + { + "epoch": 1.2074716477651768, + "loss": 0.6433, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "grad_norm": 2.566305160522461, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "learning_rate": 0.0002886438625155828, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "loss": 0.9501935243606567, + "step": 3620 + }, + { + "ce_loss": 0.23689863085746765, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "distill_loss": 0.14002355933189392, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "ref_ce_loss": 0.12740619480609894, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "loss": 0.5629335641860962, + "step": 3620 + }, + { + "ce_loss": 0.16847597062587738, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "distill_loss": 0.0989687517285347, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "ref_ce_loss": 0.18997547030448914, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "loss": 0.47026073932647705, + "step": 3620 + }, + { + "ce_loss": 0.21434803307056427, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "distill_loss": 0.10124547779560089, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "ref_ce_loss": 0.15439580380916595, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "loss": 0.9181081652641296, + "step": 3620 + }, + { + "ce_loss": 0.34848782420158386, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "distill_loss": 0.12178175896406174, + "epoch": 1.2074716477651768, + "step": 3620 + }, + { + "epoch": 1.2074716477651768, + "ref_ce_loss": 0.20156177878379822, + "step": 3620 + }, + { + "epoch": 1.2108072048032021, + "loss": 0.5965, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "grad_norm": 3.8586368560791016, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "learning_rate": 0.0002885664214422641, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "loss": 0.8446038961410522, + "step": 3630 + }, + { + "ce_loss": 0.2831612229347229, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "distill_loss": 0.117464579641819, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "ref_ce_loss": 0.15272650122642517, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "loss": 0.6270732283592224, + "step": 3630 + }, + { + "ce_loss": 0.16092394292354584, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "distill_loss": 0.07629981637001038, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "ref_ce_loss": 0.19512324035167694, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "loss": 1.1022748947143555, + "step": 3630 + }, + { + "ce_loss": 0.24730761349201202, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "distill_loss": 0.11945497244596481, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "ref_ce_loss": 0.13521139323711395, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "loss": 0.43340691924095154, + "step": 3630 + }, + { + "ce_loss": 0.1800413727760315, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "distill_loss": 0.0931423231959343, + "epoch": 1.2108072048032021, + "step": 3630 + }, + { + "epoch": 1.2108072048032021, + "ref_ce_loss": 0.15474474430084229, + "step": 3630 + }, + { + "epoch": 1.2141427618412275, + "loss": 0.5921, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "grad_norm": 3.449916362762451, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "learning_rate": 0.0002884887276784526, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "loss": 0.4230595827102661, + "step": 3640 + }, + { + "ce_loss": 0.16139523684978485, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "distill_loss": 0.088740773499012, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "ref_ce_loss": 0.11849240958690643, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "loss": 1.0855642557144165, + "step": 3640 + }, + { + "ce_loss": 0.2549758851528168, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "distill_loss": 0.13948114216327667, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "ref_ce_loss": 0.12895409762859344, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "loss": 0.6072877645492554, + "step": 3640 + }, + { + "ce_loss": 0.19314055144786835, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "distill_loss": 0.12987002730369568, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "ref_ce_loss": 0.2171475738286972, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "loss": 0.3850543797016144, + "step": 3640 + }, + { + "ce_loss": 0.15985210239887238, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "distill_loss": 0.10375294089317322, + "epoch": 1.2141427618412275, + "step": 3640 + }, + { + "epoch": 1.2141427618412275, + "ref_ce_loss": 0.12134460359811783, + "step": 3640 + }, + { + "epoch": 1.2174783188792528, + "loss": 0.6544, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "grad_norm": 2.291261911392212, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "learning_rate": 0.0002884107813658312, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "loss": 0.8231579065322876, + "step": 3650 + }, + { + "ce_loss": 0.3279842436313629, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "distill_loss": 0.10889627039432526, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "ref_ce_loss": 0.2368888109922409, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "loss": 0.570781946182251, + "step": 3650 + }, + { + "ce_loss": 0.2116650938987732, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "distill_loss": 0.12157993018627167, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "ref_ce_loss": 0.15611954033374786, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "loss": 0.48231443762779236, + "step": 3650 + }, + { + "ce_loss": 0.18459239602088928, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "distill_loss": 0.0977061539888382, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "ref_ce_loss": 0.15423721075057983, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "loss": 0.4680092930793762, + "step": 3650 + }, + { + "ce_loss": 0.1793879270553589, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "distill_loss": 0.1235269084572792, + "epoch": 1.2174783188792528, + "step": 3650 + }, + { + "epoch": 1.2174783188792528, + "ref_ce_loss": 0.0962749794125557, + "step": 3650 + }, + { + "epoch": 1.2208138759172782, + "loss": 0.7043, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "grad_norm": 6.269075393676758, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "learning_rate": 0.0002883325826465432, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "loss": 0.5581228137016296, + "step": 3660 + }, + { + "ce_loss": 0.2508165240287781, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "distill_loss": 0.09761396795511246, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "ref_ce_loss": 0.12982720136642456, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "loss": 0.6540179252624512, + "step": 3660 + }, + { + "ce_loss": 0.22440961003303528, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "distill_loss": 0.1306951344013214, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "ref_ce_loss": 0.14553359150886536, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "loss": 1.250847339630127, + "step": 3660 + }, + { + "ce_loss": 0.27133795619010925, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "distill_loss": 0.12290221452713013, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "ref_ce_loss": 0.12761349976062775, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "loss": 0.43164223432540894, + "step": 3660 + }, + { + "ce_loss": 0.13554264605045319, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "distill_loss": 0.10463345795869827, + "epoch": 1.2208138759172782, + "step": 3660 + }, + { + "epoch": 1.2208138759172782, + "ref_ce_loss": 0.10604510456323624, + "step": 3660 + }, + { + "epoch": 1.2241494329553035, + "loss": 0.697, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "grad_norm": 3.5499510765075684, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "learning_rate": 0.00028825413166319217, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "loss": 0.5550094246864319, + "step": 3670 + }, + { + "ce_loss": 0.15072378516197205, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "distill_loss": 0.08958537876605988, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "ref_ce_loss": 0.11560764163732529, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "loss": 0.6072015762329102, + "step": 3670 + }, + { + "ce_loss": 0.30362680554389954, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "distill_loss": 0.11939045041799545, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "ref_ce_loss": 0.1375572234392166, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "loss": 0.503847599029541, + "step": 3670 + }, + { + "ce_loss": 0.2185611128807068, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "distill_loss": 0.10109249502420425, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "ref_ce_loss": 0.13138023018836975, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "loss": 0.6444181203842163, + "step": 3670 + }, + { + "ce_loss": 0.2963375449180603, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "distill_loss": 0.17675013840198517, + "epoch": 1.2241494329553035, + "step": 3670 + }, + { + "epoch": 1.2241494329553035, + "ref_ce_loss": 0.17069223523139954, + "step": 3670 + }, + { + "epoch": 1.227484989993329, + "loss": 0.6122, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "grad_norm": 1.8238017559051514, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "learning_rate": 0.0002881754285588418, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "loss": 0.610914409160614, + "step": 3680 + }, + { + "ce_loss": 0.16786062717437744, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "distill_loss": 0.09828974306583405, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "ref_ce_loss": 0.13559795916080475, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "loss": 1.0151851177215576, + "step": 3680 + }, + { + "ce_loss": 0.2395806908607483, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "distill_loss": 0.11997424811124802, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "ref_ce_loss": 0.1494312435388565, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "loss": 0.7240346074104309, + "step": 3680 + }, + { + "ce_loss": 0.352152019739151, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "distill_loss": 0.1361006647348404, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "ref_ce_loss": 0.17786292731761932, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "loss": 0.5180095434188843, + "step": 3680 + }, + { + "ce_loss": 0.2050534188747406, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "distill_loss": 0.11590767651796341, + "epoch": 1.227484989993329, + "step": 3680 + }, + { + "epoch": 1.227484989993329, + "ref_ce_loss": 0.1965571939945221, + "step": 3680 + }, + { + "epoch": 1.2308205470313542, + "loss": 0.6667, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "grad_norm": 3.9641623497009277, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "learning_rate": 0.00028809647347701546, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "loss": 1.1504862308502197, + "step": 3690 + }, + { + "ce_loss": 0.2964133024215698, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "distill_loss": 0.10304144769906998, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "ref_ce_loss": 0.18209707736968994, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "loss": 0.8985886573791504, + "step": 3690 + }, + { + "ce_loss": 0.29773250222206116, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "distill_loss": 0.11341776698827744, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "ref_ce_loss": 0.16153104603290558, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "loss": 0.42868906259536743, + "step": 3690 + }, + { + "ce_loss": 0.14353783428668976, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "distill_loss": 0.09262649714946747, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "ref_ce_loss": 0.1247769147157669, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "loss": 0.47106653451919556, + "step": 3690 + }, + { + "ce_loss": 0.2131417840719223, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "distill_loss": 0.10240469127893448, + "epoch": 1.2308205470313542, + "step": 3690 + }, + { + "epoch": 1.2308205470313542, + "ref_ce_loss": 0.08614566177129745, + "step": 3690 + }, + { + "epoch": 1.2341561040693796, + "loss": 0.6576, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "grad_norm": 2.76326584815979, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "learning_rate": 0.00028801726656169617, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "loss": 0.9501110315322876, + "step": 3700 + }, + { + "ce_loss": 0.25042182207107544, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "distill_loss": 0.11113158613443375, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "ref_ce_loss": 0.18582938611507416, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "loss": 0.3989907503128052, + "step": 3700 + }, + { + "ce_loss": 0.1486864984035492, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "distill_loss": 0.12427866458892822, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "ref_ce_loss": 0.08574719727039337, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "loss": 0.5246773958206177, + "step": 3700 + }, + { + "ce_loss": 0.18894904851913452, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "distill_loss": 0.12698686122894287, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "ref_ce_loss": 0.16238951683044434, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "loss": 0.48624590039253235, + "step": 3700 + }, + { + "ce_loss": 0.15266580879688263, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "distill_loss": 0.11565395444631577, + "epoch": 1.2341561040693796, + "step": 3700 + }, + { + "epoch": 1.2341561040693796, + "ref_ce_loss": 0.12820059061050415, + "step": 3700 + }, + { + "epoch": 1.237491661107405, + "loss": 0.6881, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "grad_norm": 2.4743893146514893, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "learning_rate": 0.00028793780795732603, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "loss": 0.47350871562957764, + "step": 3710 + }, + { + "ce_loss": 0.2040337473154068, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "distill_loss": 0.14777837693691254, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "ref_ce_loss": 0.1179155632853508, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "loss": 0.4797500967979431, + "step": 3710 + }, + { + "ce_loss": 0.1412224918603897, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "distill_loss": 0.13309955596923828, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "ref_ce_loss": 0.15007424354553223, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "loss": 0.6380416750907898, + "step": 3710 + }, + { + "ce_loss": 0.11058314144611359, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "distill_loss": 0.12416449189186096, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "ref_ce_loss": 0.13631749153137207, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "loss": 0.473675012588501, + "step": 3710 + }, + { + "ce_loss": 0.17507950961589813, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "distill_loss": 0.1544097363948822, + "epoch": 1.237491661107405, + "step": 3710 + }, + { + "epoch": 1.237491661107405, + "ref_ce_loss": 0.14373698830604553, + "step": 3710 + }, + { + "epoch": 1.2408272181454303, + "loss": 0.6932, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "grad_norm": 2.810713768005371, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "learning_rate": 0.0002878580978088062, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "loss": 0.4802871644496918, + "step": 3720 + }, + { + "ce_loss": 0.19279566407203674, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "distill_loss": 0.1619156002998352, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "ref_ce_loss": 0.12544824182987213, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "loss": 0.7181475162506104, + "step": 3720 + }, + { + "ce_loss": 0.16339702904224396, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "distill_loss": 0.13173145055770874, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "ref_ce_loss": 0.1465282142162323, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "loss": 0.46347326040267944, + "step": 3720 + }, + { + "ce_loss": 0.1841423511505127, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "distill_loss": 0.14019352197647095, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "ref_ce_loss": 0.13906390964984894, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "loss": 0.7790138721466064, + "step": 3720 + }, + { + "ce_loss": 0.2417084127664566, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "distill_loss": 0.18298661708831787, + "epoch": 1.2408272181454303, + "step": 3720 + }, + { + "epoch": 1.2408272181454303, + "ref_ce_loss": 0.17605675756931305, + "step": 3720 + }, + { + "epoch": 1.2441627751834556, + "loss": 0.6738, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "grad_norm": 2.822309732437134, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "learning_rate": 0.00028777813626149653, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "loss": 0.7054616212844849, + "step": 3730 + }, + { + "ce_loss": 0.26692935824394226, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "distill_loss": 0.17259365320205688, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "ref_ce_loss": 0.13720785081386566, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "loss": 0.5728940963745117, + "step": 3730 + }, + { + "ce_loss": 0.24504394829273224, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "distill_loss": 0.15896917879581451, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "ref_ce_loss": 0.16876274347305298, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "loss": 0.5301774144172668, + "step": 3730 + }, + { + "ce_loss": 0.2005605250597, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "distill_loss": 0.1347273737192154, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "ref_ce_loss": 0.13233374059200287, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "loss": 0.832635760307312, + "step": 3730 + }, + { + "ce_loss": 0.285343736410141, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "distill_loss": 0.1432167887687683, + "epoch": 1.2441627751834556, + "step": 3730 + }, + { + "epoch": 1.2441627751834556, + "ref_ce_loss": 0.14712287485599518, + "step": 3730 + }, + { + "epoch": 1.247498332221481, + "loss": 0.6895, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "grad_norm": 2.4720518589019775, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "learning_rate": 0.0002876979234612153, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "loss": 0.4753130078315735, + "step": 3740 + }, + { + "ce_loss": 0.1841742992401123, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "distill_loss": 0.13408738374710083, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "ref_ce_loss": 0.15682215988636017, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "loss": 0.5171338319778442, + "step": 3740 + }, + { + "ce_loss": 0.26918095350265503, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "distill_loss": 0.13590781390666962, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "ref_ce_loss": 0.11171242594718933, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "loss": 0.34572017192840576, + "step": 3740 + }, + { + "ce_loss": 0.12438411265611649, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "distill_loss": 0.10231960564851761, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "ref_ce_loss": 0.11600019037723541, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "loss": 0.4994368553161621, + "step": 3740 + }, + { + "ce_loss": 0.15896347165107727, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "distill_loss": 0.1320989727973938, + "epoch": 1.247498332221481, + "step": 3740 + }, + { + "epoch": 1.247498332221481, + "ref_ce_loss": 0.1545882225036621, + "step": 3740 + }, + { + "epoch": 1.2508338892595063, + "loss": 0.6165, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "grad_norm": 2.6057846546173096, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "learning_rate": 0.00028761745955423917, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "loss": 0.9892425537109375, + "step": 3750 + }, + { + "ce_loss": 0.2851608395576477, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "distill_loss": 0.12828153371810913, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "ref_ce_loss": 0.17768558859825134, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "loss": 1.2168772220611572, + "step": 3750 + }, + { + "ce_loss": 0.2085263729095459, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "distill_loss": 0.10808371007442474, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "ref_ce_loss": 0.12219865620136261, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "loss": 0.38408538699150085, + "step": 3750 + }, + { + "ce_loss": 0.17760694026947021, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "distill_loss": 0.11270315945148468, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "ref_ce_loss": 0.0935731828212738, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "loss": 0.8690676093101501, + "step": 3750 + }, + { + "ce_loss": 0.3034827411174774, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "distill_loss": 0.12080539017915726, + "epoch": 1.2508338892595063, + "step": 3750 + }, + { + "epoch": 1.2508338892595063, + "ref_ce_loss": 0.2369447648525238, + "step": 3750 + }, + { + "epoch": 1.2541694462975317, + "loss": 0.6438, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "grad_norm": 2.890160083770752, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "learning_rate": 0.00028753674468730246, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "loss": 0.5276728868484497, + "step": 3760 + }, + { + "ce_loss": 0.16678202152252197, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "distill_loss": 0.08398936688899994, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "ref_ce_loss": 0.14039310812950134, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "loss": 0.43622809648513794, + "step": 3760 + }, + { + "ce_loss": 0.15578439831733704, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "distill_loss": 0.0966634452342987, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "ref_ce_loss": 0.1068943589925766, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "loss": 0.7373557090759277, + "step": 3760 + }, + { + "ce_loss": 0.3678506910800934, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "distill_loss": 0.13448716700077057, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "ref_ce_loss": 0.16213582456111908, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "loss": 0.3833318054676056, + "step": 3760 + }, + { + "ce_loss": 0.10126097500324249, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "distill_loss": 0.0806593969464302, + "epoch": 1.2541694462975317, + "step": 3760 + }, + { + "epoch": 1.2541694462975317, + "ref_ce_loss": 0.10641855001449585, + "step": 3760 + }, + { + "epoch": 1.257505003335557, + "loss": 0.6276, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "grad_norm": 3.256934404373169, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "learning_rate": 0.00028745577900759724, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "loss": 1.00767183303833, + "step": 3770 + }, + { + "ce_loss": 0.2344723790884018, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "distill_loss": 0.157794788479805, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "ref_ce_loss": 0.18107791244983673, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "loss": 0.5176934599876404, + "step": 3770 + }, + { + "ce_loss": 0.21863307058811188, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "distill_loss": 0.14407603442668915, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "ref_ce_loss": 0.08990081399679184, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "loss": 0.43721339106559753, + "step": 3770 + }, + { + "ce_loss": 0.17311523854732513, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "distill_loss": 0.09926941245794296, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "ref_ce_loss": 0.16452080011367798, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "loss": 0.7104783058166504, + "step": 3770 + }, + { + "ce_loss": 0.18649962544441223, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "distill_loss": 0.1250247359275818, + "epoch": 1.257505003335557, + "step": 3770 + }, + { + "epoch": 1.257505003335557, + "ref_ce_loss": 0.1523253321647644, + "step": 3770 + }, + { + "epoch": 1.2608405603735824, + "loss": 0.663, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "grad_norm": 2.457021713256836, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "learning_rate": 0.000287374562662773, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "loss": 0.520882785320282, + "step": 3780 + }, + { + "ce_loss": 0.24153728783130646, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "distill_loss": 0.1284068077802658, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "ref_ce_loss": 0.15089614689350128, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "loss": 0.5422986745834351, + "step": 3780 + }, + { + "ce_loss": 0.15094715356826782, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "distill_loss": 0.10879409313201904, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "ref_ce_loss": 0.15774983167648315, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "loss": 0.5709801912307739, + "step": 3780 + }, + { + "ce_loss": 0.211181640625, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "distill_loss": 0.139474555850029, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "ref_ce_loss": 0.1433981955051422, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "loss": 0.6591182947158813, + "step": 3780 + }, + { + "ce_loss": 0.25575539469718933, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "distill_loss": 0.13369643688201904, + "epoch": 1.2608405603735824, + "step": 3780 + }, + { + "epoch": 1.2608405603735824, + "ref_ce_loss": 0.17987275123596191, + "step": 3780 + }, + { + "epoch": 1.2641761174116077, + "loss": 0.6722, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "grad_norm": 3.7740092277526855, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "learning_rate": 0.0002872930958009363, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "loss": 0.5643190145492554, + "step": 3790 + }, + { + "ce_loss": 0.2098139077425003, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "distill_loss": 0.11965808272361755, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "ref_ce_loss": 0.18449269235134125, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "loss": 0.638534665107727, + "step": 3790 + }, + { + "ce_loss": 0.3083104193210602, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "distill_loss": 0.148521289229393, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "ref_ce_loss": 0.18149085342884064, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "loss": 0.7080193758010864, + "step": 3790 + }, + { + "ce_loss": 0.20180247724056244, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "distill_loss": 0.13183492422103882, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "ref_ce_loss": 0.12661731243133545, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "loss": 0.7978255748748779, + "step": 3790 + }, + { + "ce_loss": 0.3292304575443268, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "distill_loss": 0.15548944473266602, + "epoch": 1.2641761174116077, + "step": 3790 + }, + { + "epoch": 1.2641761174116077, + "ref_ce_loss": 0.17226652801036835, + "step": 3790 + }, + { + "epoch": 1.267511674449633, + "loss": 0.6626, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "grad_norm": 2.5062668323516846, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "learning_rate": 0.0002872113785706506, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "loss": 0.6491740942001343, + "step": 3800 + }, + { + "ce_loss": 0.23980125784873962, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "distill_loss": 0.12023845314979553, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "ref_ce_loss": 0.12148669362068176, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "loss": 0.6998868584632874, + "step": 3800 + }, + { + "ce_loss": 0.24571934342384338, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "distill_loss": 0.13267642259597778, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "ref_ce_loss": 0.12511129677295685, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "loss": 0.3711976110935211, + "step": 3800 + }, + { + "ce_loss": 0.16206589341163635, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "distill_loss": 0.11592836678028107, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "ref_ce_loss": 0.0925321877002716, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "loss": 0.34810692071914673, + "step": 3800 + }, + { + "ce_loss": 0.14879584312438965, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "distill_loss": 0.10272634029388428, + "epoch": 1.267511674449633, + "step": 3800 + }, + { + "epoch": 1.267511674449633, + "ref_ce_loss": 0.09614356607198715, + "step": 3800 + }, + { + "epoch": 1.2708472314876584, + "loss": 0.6808, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "grad_norm": 2.2756903171539307, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "learning_rate": 0.0002871294111209358, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "loss": 0.6024037599563599, + "step": 3810 + }, + { + "ce_loss": 0.16057908535003662, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "distill_loss": 0.13251294195652008, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "ref_ce_loss": 0.14679312705993652, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "loss": 1.074276328086853, + "step": 3810 + }, + { + "ce_loss": 0.2896794378757477, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "distill_loss": 0.148069828748703, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "ref_ce_loss": 0.1660357266664505, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "loss": 0.6567397713661194, + "step": 3810 + }, + { + "ce_loss": 0.22404265403747559, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "distill_loss": 0.11648007482290268, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "ref_ce_loss": 0.1753883957862854, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "loss": 0.8125191330909729, + "step": 3810 + }, + { + "ce_loss": 0.32670843601226807, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "distill_loss": 0.1736506074666977, + "epoch": 1.2708472314876584, + "step": 3810 + }, + { + "epoch": 1.2708472314876584, + "ref_ce_loss": 0.147975355386734, + "step": 3810 + }, + { + "epoch": 1.2741827885256838, + "loss": 0.6292, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "grad_norm": 5.134918689727783, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "learning_rate": 0.0002870471936012683, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "loss": 0.9305306077003479, + "step": 3820 + }, + { + "ce_loss": 0.22662502527236938, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "distill_loss": 0.15132461488246918, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "ref_ce_loss": 0.1143263652920723, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "loss": 0.7782340049743652, + "step": 3820 + }, + { + "ce_loss": 0.2704147398471832, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "distill_loss": 0.1646704226732254, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "ref_ce_loss": 0.14147979021072388, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "loss": 0.6947014331817627, + "step": 3820 + }, + { + "ce_loss": 0.29456502199172974, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "distill_loss": 0.1552099734544754, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "ref_ce_loss": 0.15763463079929352, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "loss": 0.43103697896003723, + "step": 3820 + }, + { + "ce_loss": 0.13802245259284973, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "distill_loss": 0.1441994160413742, + "epoch": 1.2741827885256838, + "step": 3820 + }, + { + "epoch": 1.2741827885256838, + "ref_ce_loss": 0.14861568808555603, + "step": 3820 + }, + { + "epoch": 1.2775183455637091, + "loss": 0.7171, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "grad_norm": 2.4478530883789062, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "learning_rate": 0.0002869647261615803, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "loss": 0.46987560391426086, + "step": 3830 + }, + { + "ce_loss": 0.1733369082212448, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "distill_loss": 0.10979942977428436, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "ref_ce_loss": 0.15175464749336243, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "loss": 1.4397883415222168, + "step": 3830 + }, + { + "ce_loss": 0.4084470570087433, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "distill_loss": 0.1578616499900818, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "ref_ce_loss": 0.15876798331737518, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "loss": 0.6761387586593628, + "step": 3830 + }, + { + "ce_loss": 0.26621678471565247, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "distill_loss": 0.18939264118671417, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "ref_ce_loss": 0.13820181787014008, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "loss": 0.6198230981826782, + "step": 3830 + }, + { + "ce_loss": 0.2509312927722931, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "distill_loss": 0.1535424441099167, + "epoch": 1.2775183455637091, + "step": 3830 + }, + { + "epoch": 1.2775183455637091, + "ref_ce_loss": 0.1641911268234253, + "step": 3830 + }, + { + "epoch": 1.2808539026017345, + "loss": 0.6643, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "grad_norm": 2.382905960083008, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "learning_rate": 0.00028688200895226, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "loss": 0.6250182390213013, + "step": 3840 + }, + { + "ce_loss": 0.20252063870429993, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "distill_loss": 0.14306099712848663, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "ref_ce_loss": 0.1793094277381897, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "loss": 0.47006261348724365, + "step": 3840 + }, + { + "ce_loss": 0.15936064720153809, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "distill_loss": 0.11916495114564896, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "ref_ce_loss": 0.11021065711975098, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "loss": 0.47605329751968384, + "step": 3840 + }, + { + "ce_loss": 0.17115725576877594, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "distill_loss": 0.13069213926792145, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "ref_ce_loss": 0.10102272033691406, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "loss": 0.6017922163009644, + "step": 3840 + }, + { + "ce_loss": 0.19884739816188812, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "distill_loss": 0.24015085399150848, + "epoch": 1.2808539026017345, + "step": 3840 + }, + { + "epoch": 1.2808539026017345, + "ref_ce_loss": 0.08779125660657883, + "step": 3840 + }, + { + "epoch": 1.2841894596397598, + "loss": 0.6084, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "grad_norm": 2.060270071029663, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "learning_rate": 0.00028679904212415097, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "loss": 0.5458661913871765, + "step": 3850 + }, + { + "ce_loss": 0.2180897742509842, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "distill_loss": 0.10629566013813019, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "ref_ce_loss": 0.15160155296325684, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "loss": 0.8067355751991272, + "step": 3850 + }, + { + "ce_loss": 0.39867109060287476, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "distill_loss": 0.14299823343753815, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "ref_ce_loss": 0.2649121880531311, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "loss": 0.697068989276886, + "step": 3850 + }, + { + "ce_loss": 0.19222131371498108, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "distill_loss": 0.11827998608350754, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "ref_ce_loss": 0.15390853583812714, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "loss": 0.47433117032051086, + "step": 3850 + }, + { + "ce_loss": 0.1541450470685959, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "distill_loss": 0.10531582683324814, + "epoch": 1.2841894596397598, + "step": 3850 + }, + { + "epoch": 1.2841894596397598, + "ref_ce_loss": 0.13863231241703033, + "step": 3850 + }, + { + "epoch": 1.2875250166777852, + "loss": 0.6328, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "grad_norm": 2.1689889430999756, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "learning_rate": 0.00028671582582855186, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "loss": 0.5351623296737671, + "step": 3860 + }, + { + "ce_loss": 0.11756846308708191, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "distill_loss": 0.1291414052248001, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "ref_ce_loss": 0.10097652673721313, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "loss": 0.988185703754425, + "step": 3860 + }, + { + "ce_loss": 0.2319861650466919, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "distill_loss": 0.1574300080537796, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "ref_ce_loss": 0.18644386529922485, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "loss": 0.729952335357666, + "step": 3860 + }, + { + "ce_loss": 0.30251288414001465, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "distill_loss": 0.16945448517799377, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "ref_ce_loss": 0.2575605809688568, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "loss": 0.4808621108531952, + "step": 3860 + }, + { + "ce_loss": 0.17800524830818176, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "distill_loss": 0.14971697330474854, + "epoch": 1.2875250166777852, + "step": 3860 + }, + { + "epoch": 1.2875250166777852, + "ref_ce_loss": 0.10496334731578827, + "step": 3860 + }, + { + "epoch": 1.2908605737158105, + "loss": 0.6641, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "grad_norm": 2.5105247497558594, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "learning_rate": 0.00028663236021721645, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "loss": 0.4470581114292145, + "step": 3870 + }, + { + "ce_loss": 0.18677105009555817, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "distill_loss": 0.08762294054031372, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "ref_ce_loss": 0.1723141223192215, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "loss": 0.6644189357757568, + "step": 3870 + }, + { + "ce_loss": 0.35353413224220276, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "distill_loss": 0.14174126088619232, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "ref_ce_loss": 0.1683140993118286, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "loss": 0.5526446104049683, + "step": 3870 + }, + { + "ce_loss": 0.16454748809337616, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "distill_loss": 0.11832288652658463, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "ref_ce_loss": 0.13812904059886932, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "loss": 1.190387487411499, + "step": 3870 + }, + { + "ce_loss": 0.24219068884849548, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "distill_loss": 0.12967538833618164, + "epoch": 1.2908605737158105, + "step": 3870 + }, + { + "epoch": 1.2908605737158105, + "ref_ce_loss": 0.19826941192150116, + "step": 3870 + }, + { + "epoch": 1.2941961307538359, + "loss": 0.6134, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "grad_norm": 2.6475019454956055, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "learning_rate": 0.00028654864544235307, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "loss": 0.4416970908641815, + "step": 3880 + }, + { + "ce_loss": 0.1246199682354927, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "distill_loss": 0.12249705195426941, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "ref_ce_loss": 0.12978720664978027, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "loss": 0.48672401905059814, + "step": 3880 + }, + { + "ce_loss": 0.16806533932685852, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "distill_loss": 0.13379566371440887, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "ref_ce_loss": 0.1297888159751892, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "loss": 0.7681386470794678, + "step": 3880 + }, + { + "ce_loss": 0.1779935210943222, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "distill_loss": 0.18026070296764374, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "ref_ce_loss": 0.12299671024084091, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "loss": 1.157135009765625, + "step": 3880 + }, + { + "ce_loss": 0.31688815355300903, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "distill_loss": 0.1663898229598999, + "epoch": 1.2941961307538359, + "step": 3880 + }, + { + "epoch": 1.2941961307538359, + "ref_ce_loss": 0.14574475586414337, + "step": 3880 + }, + { + "epoch": 1.2975316877918612, + "loss": 0.6617, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "grad_norm": 2.3238894939422607, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "learning_rate": 0.00028646468165662443, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "loss": 0.45971766114234924, + "step": 3890 + }, + { + "ce_loss": 0.20718860626220703, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "distill_loss": 0.10672736167907715, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "ref_ce_loss": 0.14458438754081726, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "loss": 0.4675193130970001, + "step": 3890 + }, + { + "ce_loss": 0.17066477239131927, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "distill_loss": 0.10979650914669037, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "ref_ce_loss": 0.13548187911510468, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "loss": 0.8037819862365723, + "step": 3890 + }, + { + "ce_loss": 0.26504796743392944, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "distill_loss": 0.14809170365333557, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "ref_ce_loss": 0.16031919419765472, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "loss": 0.4144549071788788, + "step": 3890 + }, + { + "ce_loss": 0.18542304635047913, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "distill_loss": 0.11288897693157196, + "epoch": 1.2975316877918612, + "step": 3890 + }, + { + "epoch": 1.2975316877918612, + "ref_ce_loss": 0.11544924974441528, + "step": 3890 + }, + { + "epoch": 1.3008672448298866, + "loss": 0.6172, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "grad_norm": 2.562744379043579, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "learning_rate": 0.0002863804690131474, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "loss": 0.6487498879432678, + "step": 3900 + }, + { + "ce_loss": 0.23641221225261688, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "distill_loss": 0.13633979856967926, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "ref_ce_loss": 0.21698123216629028, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "loss": 0.7199804782867432, + "step": 3900 + }, + { + "ce_loss": 0.22160635888576508, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "distill_loss": 0.14556357264518738, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "ref_ce_loss": 0.16044962406158447, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "loss": 0.3134487271308899, + "step": 3900 + }, + { + "ce_loss": 0.10460028797388077, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "distill_loss": 0.124151811003685, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "ref_ce_loss": 0.08378157019615173, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "loss": 1.0430158376693726, + "step": 3900 + }, + { + "ce_loss": 0.20616380870342255, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "distill_loss": 0.10245034098625183, + "epoch": 1.3008672448298866, + "step": 3900 + }, + { + "epoch": 1.3008672448298866, + "ref_ce_loss": 0.13835114240646362, + "step": 3900 + }, + { + "epoch": 1.304202801867912, + "loss": 0.6747, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "grad_norm": 3.7162861824035645, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "learning_rate": 0.00028629600766549266, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "loss": 0.5505048036575317, + "step": 3910 + }, + { + "ce_loss": 0.2205231487751007, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "distill_loss": 0.13492049276828766, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "ref_ce_loss": 0.130891352891922, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "loss": 0.622995138168335, + "step": 3910 + }, + { + "ce_loss": 0.23485012352466583, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "distill_loss": 0.16180278360843658, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "ref_ce_loss": 0.22551412880420685, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "loss": 0.4668801426887512, + "step": 3910 + }, + { + "ce_loss": 0.19062143564224243, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "distill_loss": 0.12799620628356934, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "ref_ce_loss": 0.1471155434846878, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "loss": 0.8399578332901001, + "step": 3910 + }, + { + "ce_loss": 0.16672414541244507, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "distill_loss": 0.13798189163208008, + "epoch": 1.304202801867912, + "step": 3910 + }, + { + "epoch": 1.304202801867912, + "ref_ce_loss": 0.1093071699142456, + "step": 3910 + }, + { + "epoch": 1.3075383589059373, + "loss": 0.6499, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "grad_norm": 4.548922538757324, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "learning_rate": 0.00028621129776768424, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "loss": 0.6969051361083984, + "step": 3920 + }, + { + "ce_loss": 0.2939198613166809, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "distill_loss": 0.1349041759967804, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "ref_ce_loss": 0.1603991836309433, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "loss": 0.665176510810852, + "step": 3920 + }, + { + "ce_loss": 0.14564870297908783, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "distill_loss": 0.09071095287799835, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "ref_ce_loss": 0.12003415077924728, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "loss": 0.5591284036636353, + "step": 3920 + }, + { + "ce_loss": 0.2728961706161499, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "distill_loss": 0.11606838554143906, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "ref_ce_loss": 0.12034892290830612, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "loss": 0.6922830939292908, + "step": 3920 + }, + { + "ce_loss": 0.17354288697242737, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "distill_loss": 0.10280630737543106, + "epoch": 1.3075383589059373, + "step": 3920 + }, + { + "epoch": 1.3075383589059373, + "ref_ce_loss": 0.1029202789068222, + "step": 3920 + }, + { + "epoch": 1.3108739159439626, + "loss": 0.5879, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "grad_norm": 1.9159053564071655, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "learning_rate": 0.0002861263394741996, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "loss": 0.5317827463150024, + "step": 3930 + }, + { + "ce_loss": 0.22766584157943726, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "distill_loss": 0.09339602291584015, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "ref_ce_loss": 0.12791801989078522, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "loss": 0.35215145349502563, + "step": 3930 + }, + { + "ce_loss": 0.09111224114894867, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "distill_loss": 0.09845034033060074, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "ref_ce_loss": 0.11704370379447937, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "loss": 0.7495036125183105, + "step": 3930 + }, + { + "ce_loss": 0.11601284891366959, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "distill_loss": 0.14619024097919464, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "ref_ce_loss": 0.09743039309978485, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "loss": 0.6388373374938965, + "step": 3930 + }, + { + "ce_loss": 0.14739064872264862, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "distill_loss": 0.11072908341884613, + "epoch": 1.3108739159439626, + "step": 3930 + }, + { + "epoch": 1.3108739159439626, + "ref_ce_loss": 0.13757041096687317, + "step": 3930 + }, + { + "epoch": 1.314209472981988, + "loss": 0.6759, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "grad_norm": 2.8044283390045166, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "learning_rate": 0.00028604113293996937, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "loss": 0.3935573399066925, + "step": 3940 + }, + { + "ce_loss": 0.1724274754524231, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "distill_loss": 0.10461033880710602, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "ref_ce_loss": 0.11499915271997452, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "loss": 0.5409625172615051, + "step": 3940 + }, + { + "ce_loss": 0.19719979166984558, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "distill_loss": 0.12273137271404266, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "ref_ce_loss": 0.11761221289634705, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "loss": 0.6423748731613159, + "step": 3940 + }, + { + "ce_loss": 0.18682481348514557, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "distill_loss": 0.11395197361707687, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "ref_ce_loss": 0.09778361767530441, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "loss": 0.6339865922927856, + "step": 3940 + }, + { + "ce_loss": 0.15020598471164703, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "distill_loss": 0.11978481709957123, + "epoch": 1.314209472981988, + "step": 3940 + }, + { + "epoch": 1.314209472981988, + "ref_ce_loss": 0.16673775017261505, + "step": 3940 + }, + { + "epoch": 1.3175450300200133, + "loss": 0.6849, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "grad_norm": 7.299792766571045, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "learning_rate": 0.0002859556783203764, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "loss": 0.6421574950218201, + "step": 3950 + }, + { + "ce_loss": 0.2580649256706238, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "distill_loss": 0.15140002965927124, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "ref_ce_loss": 0.1874542534351349, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "loss": 0.6344290375709534, + "step": 3950 + }, + { + "ce_loss": 0.22702381014823914, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "distill_loss": 0.17865008115768433, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "ref_ce_loss": 0.08265173435211182, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "loss": 0.9704930782318115, + "step": 3950 + }, + { + "ce_loss": 0.24519315361976624, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "distill_loss": 0.10518443584442139, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "ref_ce_loss": 0.15663664042949677, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "loss": 0.48585355281829834, + "step": 3950 + }, + { + "ce_loss": 0.1515163630247116, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "distill_loss": 0.17025715112686157, + "epoch": 1.3175450300200133, + "step": 3950 + }, + { + "epoch": 1.3175450300200133, + "ref_ce_loss": 0.11852209270000458, + "step": 3950 + }, + { + "epoch": 1.3208805870580387, + "loss": 0.7196, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "grad_norm": 2.706219434738159, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "learning_rate": 0.00028586997577125634, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "loss": 0.3871728777885437, + "step": 3960 + }, + { + "ce_loss": 0.11327257752418518, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "distill_loss": 0.1323593109846115, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "ref_ce_loss": 0.07896266877651215, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "loss": 0.49185916781425476, + "step": 3960 + }, + { + "ce_loss": 0.20270958542823792, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "distill_loss": 0.12782743573188782, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "ref_ce_loss": 0.09821378439664841, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "loss": 0.8527625799179077, + "step": 3960 + }, + { + "ce_loss": 0.28227877616882324, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "distill_loss": 0.1517748087644577, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "ref_ce_loss": 0.14076222479343414, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "loss": 0.45873206853866577, + "step": 3960 + }, + { + "ce_loss": 0.1606334000825882, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "distill_loss": 0.13742315769195557, + "epoch": 1.3208805870580387, + "step": 3960 + }, + { + "epoch": 1.3208805870580387, + "ref_ce_loss": 0.10351046919822693, + "step": 3960 + }, + { + "epoch": 1.324216144096064, + "loss": 0.639, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "grad_norm": 1.9281312227249146, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "learning_rate": 0.0002857840254488968, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "loss": 0.3973920941352844, + "step": 3970 + }, + { + "ce_loss": 0.12656432390213013, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "distill_loss": 0.1339094638824463, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "ref_ce_loss": 0.1367185264825821, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "loss": 0.6172012090682983, + "step": 3970 + }, + { + "ce_loss": 0.10915721207857132, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "distill_loss": 0.12282588332891464, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "ref_ce_loss": 0.12575297057628632, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "loss": 0.7371445894241333, + "step": 3970 + }, + { + "ce_loss": 0.1586148589849472, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "distill_loss": 0.13597846031188965, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "ref_ce_loss": 0.18562176823616028, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "loss": 0.5300966501235962, + "step": 3970 + }, + { + "ce_loss": 0.14332100749015808, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "distill_loss": 0.11754573881626129, + "epoch": 1.324216144096064, + "step": 3970 + }, + { + "epoch": 1.324216144096064, + "ref_ce_loss": 0.10700936615467072, + "step": 3970 + }, + { + "epoch": 1.3275517011340894, + "loss": 0.6286, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "grad_norm": 3.6821515560150146, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "learning_rate": 0.0002856978275100373, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "loss": 0.6550862789154053, + "step": 3980 + }, + { + "ce_loss": 0.28518232703208923, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "distill_loss": 0.15741953253746033, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "ref_ce_loss": 0.13941185176372528, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "loss": 0.5626774430274963, + "step": 3980 + }, + { + "ce_loss": 0.22388912737369537, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "distill_loss": 0.16154928505420685, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "ref_ce_loss": 0.11383243650197983, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "loss": 0.7328876852989197, + "step": 3980 + }, + { + "ce_loss": 0.21470485627651215, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "distill_loss": 0.15145975351333618, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "ref_ce_loss": 0.1325717270374298, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "loss": 0.8754336833953857, + "step": 3980 + }, + { + "ce_loss": 0.36667221784591675, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "distill_loss": 0.20364922285079956, + "epoch": 1.3275517011340894, + "step": 3980 + }, + { + "epoch": 1.3275517011340894, + "ref_ce_loss": 0.14337895810604095, + "step": 3980 + }, + { + "epoch": 1.3308872581721147, + "loss": 0.6202, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "grad_norm": 2.8255584239959717, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "learning_rate": 0.0002856113821118688, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "loss": 0.9397168159484863, + "step": 3990 + }, + { + "ce_loss": 0.26779523491859436, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "distill_loss": 0.1263412982225418, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "ref_ce_loss": 0.12941111624240875, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "loss": 1.023170828819275, + "step": 3990 + }, + { + "ce_loss": 0.368866503238678, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "distill_loss": 0.13925743103027344, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "ref_ce_loss": 0.2336951643228531, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "loss": 0.539009153842926, + "step": 3990 + }, + { + "ce_loss": 0.19066128134727478, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "distill_loss": 0.13218340277671814, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "ref_ce_loss": 0.21576425433158875, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "loss": 0.4518980085849762, + "step": 3990 + }, + { + "ce_loss": 0.17046760022640228, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "distill_loss": 0.11611610651016235, + "epoch": 1.3308872581721147, + "step": 3990 + }, + { + "epoch": 1.3308872581721147, + "ref_ce_loss": 0.16487550735473633, + "step": 3990 + }, + { + "epoch": 1.33422281521014, + "loss": 0.6956, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "grad_norm": 5.867918968200684, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "learning_rate": 0.00028552468941203364, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "loss": 0.5843743085861206, + "step": 4000 + }, + { + "ce_loss": 0.18848250806331635, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "distill_loss": 0.11338036507368088, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "ref_ce_loss": 0.17525966465473175, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "loss": 1.891869068145752, + "step": 4000 + }, + { + "ce_loss": 0.308139830827713, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "distill_loss": 0.1472199261188507, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "ref_ce_loss": 0.19832171499729156, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "loss": 0.421846479177475, + "step": 4000 + }, + { + "ce_loss": 0.18127351999282837, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "distill_loss": 0.11779454350471497, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "ref_ce_loss": 0.12023581564426422, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "loss": 0.7354398369789124, + "step": 4000 + }, + { + "ce_loss": 0.24078884720802307, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "distill_loss": 0.16594208776950836, + "epoch": 1.33422281521014, + "step": 4000 + }, + { + "epoch": 1.33422281521014, + "ref_ce_loss": 0.15056820213794708, + "step": 4000 + }, + { + "epoch": 1.3375583722481654, + "loss": 0.7304, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "grad_norm": 2.075334072113037, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "learning_rate": 0.0002854377495686252, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "loss": 0.5401085615158081, + "step": 4010 + }, + { + "ce_loss": 0.24888886511325836, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "distill_loss": 0.10953701287508011, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "ref_ce_loss": 0.10295696556568146, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "loss": 0.48950207233428955, + "step": 4010 + }, + { + "ce_loss": 0.2334887534379959, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "distill_loss": 0.1012965738773346, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "ref_ce_loss": 0.11197902262210846, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "loss": 0.5430691242218018, + "step": 4010 + }, + { + "ce_loss": 0.19202624261379242, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "distill_loss": 0.11316399276256561, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "ref_ce_loss": 0.13813838362693787, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "loss": 0.5464562177658081, + "step": 4010 + }, + { + "ce_loss": 0.2877531051635742, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "distill_loss": 0.1209205761551857, + "epoch": 1.3375583722481654, + "step": 4010 + }, + { + "epoch": 1.3375583722481654, + "ref_ce_loss": 0.1376003623008728, + "step": 4010 + }, + { + "epoch": 1.3408939292861908, + "loss": 0.5972, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "grad_norm": 2.555873155593872, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "learning_rate": 0.0002853505627401873, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "loss": 0.9355064630508423, + "step": 4020 + }, + { + "ce_loss": 0.26096606254577637, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "distill_loss": 0.17161086201667786, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "ref_ce_loss": 0.23946815729141235, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "loss": 1.841139316558838, + "step": 4020 + }, + { + "ce_loss": 0.791628360748291, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "distill_loss": 0.15540112555027008, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "ref_ce_loss": 0.3392656743526459, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "loss": 1.5659822225570679, + "step": 4020 + }, + { + "ce_loss": 0.5201265811920166, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "distill_loss": 0.11186160892248154, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "ref_ce_loss": 0.272470086812973, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "loss": 0.9584152698516846, + "step": 4020 + }, + { + "ce_loss": 0.46013176441192627, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "distill_loss": 0.15966302156448364, + "epoch": 1.3408939292861908, + "step": 4020 + }, + { + "epoch": 1.3408939292861908, + "ref_ce_loss": 0.25039541721343994, + "step": 4020 + }, + { + "epoch": 1.3442294863242161, + "loss": 0.7497, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "grad_norm": 12.790151596069336, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "learning_rate": 0.00028526312908571446, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "loss": 0.656446099281311, + "step": 4030 + }, + { + "ce_loss": 0.15182964503765106, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "distill_loss": 0.1116434782743454, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "ref_ce_loss": 0.12290992587804794, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "loss": 0.40624871850013733, + "step": 4030 + }, + { + "ce_loss": 0.1445799022912979, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "distill_loss": 0.0997803807258606, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "ref_ce_loss": 0.11327410489320755, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "loss": 0.6003686189651489, + "step": 4030 + }, + { + "ce_loss": 0.1528758853673935, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "distill_loss": 0.11801660060882568, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "ref_ce_loss": 0.1294703334569931, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "loss": 0.6489979028701782, + "step": 4030 + }, + { + "ce_loss": 0.2071543037891388, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "distill_loss": 0.139252707362175, + "epoch": 1.3442294863242161, + "step": 4030 + }, + { + "epoch": 1.3442294863242161, + "ref_ce_loss": 0.1136828362941742, + "step": 4030 + }, + { + "epoch": 1.3475650433622415, + "loss": 0.6939, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "grad_norm": 3.448014259338379, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "learning_rate": 0.00028517544876465107, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "loss": 0.5830159783363342, + "step": 4040 + }, + { + "ce_loss": 0.22778114676475525, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "distill_loss": 0.13242411613464355, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "ref_ce_loss": 0.13737395405769348, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "loss": 0.3476133346557617, + "step": 4040 + }, + { + "ce_loss": 0.129276305437088, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "distill_loss": 0.1174379512667656, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "ref_ce_loss": 0.0901045873761177, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "loss": 0.6042814254760742, + "step": 4040 + }, + { + "ce_loss": 0.25413164496421814, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "distill_loss": 0.16901132464408875, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "ref_ce_loss": 0.1391698718070984, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "loss": 0.3692128360271454, + "step": 4040 + }, + { + "ce_loss": 0.11316437274217606, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "distill_loss": 0.11189061403274536, + "epoch": 1.3475650433622415, + "step": 4040 + }, + { + "epoch": 1.3475650433622415, + "ref_ce_loss": 0.08131895214319229, + "step": 4040 + }, + { + "epoch": 1.3509006004002668, + "loss": 0.6413, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "grad_norm": 2.262253761291504, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "learning_rate": 0.00028508752193689155, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "loss": 0.4915662407875061, + "step": 4050 + }, + { + "ce_loss": 0.22006672620773315, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "distill_loss": 0.14548489451408386, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "ref_ce_loss": 0.12591445446014404, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "loss": 0.8049219846725464, + "step": 4050 + }, + { + "ce_loss": 0.16947396099567413, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "distill_loss": 0.1450093388557434, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "ref_ce_loss": 0.10676059871912003, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "loss": 0.47339928150177, + "step": 4050 + }, + { + "ce_loss": 0.1769871562719345, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "distill_loss": 0.14859752357006073, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "ref_ce_loss": 0.14764495193958282, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "loss": 0.47751790285110474, + "step": 4050 + }, + { + "ce_loss": 0.1551894247531891, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "distill_loss": 0.13194677233695984, + "epoch": 1.3509006004002668, + "step": 4050 + }, + { + "epoch": 1.3509006004002668, + "ref_ce_loss": 0.10842660814523697, + "step": 4050 + }, + { + "epoch": 1.3542361574382922, + "loss": 0.6347, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "grad_norm": 2.2766051292419434, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "learning_rate": 0.0002849993487627797, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "loss": 0.8118981122970581, + "step": 4060 + }, + { + "ce_loss": 0.1651322841644287, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "distill_loss": 0.10374452918767929, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "ref_ce_loss": 0.11030875891447067, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "loss": 0.5269935727119446, + "step": 4060 + }, + { + "ce_loss": 0.25401464104652405, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "distill_loss": 0.11832722276449203, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "ref_ce_loss": 0.15434080362319946, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "loss": 0.4596205949783325, + "step": 4060 + }, + { + "ce_loss": 0.19093306362628937, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "distill_loss": 0.11900342255830765, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "ref_ce_loss": 0.1062200739979744, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "loss": 0.43231284618377686, + "step": 4060 + }, + { + "ce_loss": 0.1506810486316681, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "distill_loss": 0.10709468275308609, + "epoch": 1.3542361574382922, + "step": 4060 + }, + { + "epoch": 1.3542361574382922, + "ref_ce_loss": 0.10686808824539185, + "step": 4060 + }, + { + "epoch": 1.3575717144763175, + "loss": 0.632, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "grad_norm": 2.651055335998535, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "learning_rate": 0.0002849109294031085, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "loss": 0.4504760205745697, + "step": 4070 + }, + { + "ce_loss": 0.1254345029592514, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "distill_loss": 0.09903445094823837, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "ref_ce_loss": 0.1227196678519249, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "loss": 0.8664339780807495, + "step": 4070 + }, + { + "ce_loss": 0.265777051448822, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "distill_loss": 0.131103977560997, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "ref_ce_loss": 0.14062733948230743, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "loss": 0.5157327651977539, + "step": 4070 + }, + { + "ce_loss": 0.14958372712135315, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "distill_loss": 0.10253413021564484, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "ref_ce_loss": 0.1118377149105072, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "loss": 0.3796268105506897, + "step": 4070 + }, + { + "ce_loss": 0.16047704219818115, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "distill_loss": 0.10889697074890137, + "epoch": 1.3575717144763175, + "step": 4070 + }, + { + "epoch": 1.3575717144763175, + "ref_ce_loss": 0.11008761078119278, + "step": 4070 + }, + { + "epoch": 1.3609072715143429, + "loss": 0.7058, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "grad_norm": 3.449279308319092, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "learning_rate": 0.00028482226401912016, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "loss": 0.780163049697876, + "step": 4080 + }, + { + "ce_loss": 0.33349332213401794, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "distill_loss": 0.1183605045080185, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "ref_ce_loss": 0.161131352186203, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "loss": 0.6341320276260376, + "step": 4080 + }, + { + "ce_loss": 0.17753848433494568, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "distill_loss": 0.10119305551052094, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "ref_ce_loss": 0.1781836748123169, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "loss": 0.8396756052970886, + "step": 4080 + }, + { + "ce_loss": 0.21004533767700195, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "distill_loss": 0.1059829443693161, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "ref_ce_loss": 0.18439234793186188, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "loss": 0.5295935869216919, + "step": 4080 + }, + { + "ce_loss": 0.2418661117553711, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "distill_loss": 0.11171921342611313, + "epoch": 1.3609072715143429, + "step": 4080 + }, + { + "epoch": 1.3609072715143429, + "ref_ce_loss": 0.11636696010828018, + "step": 4080 + }, + { + "epoch": 1.3642428285523682, + "loss": 0.6155, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "grad_norm": 2.1769909858703613, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "learning_rate": 0.00028473335277250534, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "loss": 0.4903789162635803, + "step": 4090 + }, + { + "ce_loss": 0.1835421472787857, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "distill_loss": 0.07726556807756424, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "ref_ce_loss": 0.13395462930202484, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "loss": 0.5141618251800537, + "step": 4090 + }, + { + "ce_loss": 0.21861040592193604, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "distill_loss": 0.09555073082447052, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "ref_ce_loss": 0.13811346888542175, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "loss": 0.7884995341300964, + "step": 4090 + }, + { + "ce_loss": 0.4218871593475342, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "distill_loss": 0.1013653576374054, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "ref_ce_loss": 0.19294792413711548, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "loss": 0.6917357444763184, + "step": 4090 + }, + { + "ce_loss": 0.2670474946498871, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "distill_loss": 0.11097948253154755, + "epoch": 1.3642428285523682, + "step": 4090 + }, + { + "epoch": 1.3642428285523682, + "ref_ce_loss": 0.12580280005931854, + "step": 4090 + }, + { + "epoch": 1.3675783855903936, + "loss": 0.6481, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "grad_norm": 2.9323387145996094, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "learning_rate": 0.00028464419582540295, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "loss": 0.6545873880386353, + "step": 4100 + }, + { + "ce_loss": 0.3142815828323364, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "distill_loss": 0.13354846835136414, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "ref_ce_loss": 0.17563565075397491, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "loss": 0.8540979623794556, + "step": 4100 + }, + { + "ce_loss": 0.21789871156215668, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "distill_loss": 0.12937171757221222, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "ref_ce_loss": 0.15378691256046295, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "loss": 0.7798890471458435, + "step": 4100 + }, + { + "ce_loss": 0.3511600196361542, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "distill_loss": 0.14735189080238342, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "ref_ce_loss": 0.15318696200847626, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "loss": 0.5215632915496826, + "step": 4100 + }, + { + "ce_loss": 0.15824168920516968, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "distill_loss": 0.10788905620574951, + "epoch": 1.3675783855903936, + "step": 4100 + }, + { + "epoch": 1.3675783855903936, + "ref_ce_loss": 0.11126714199781418, + "step": 4100 + }, + { + "epoch": 1.370913942628419, + "loss": 0.6594, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "grad_norm": 1.8464733362197876, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "learning_rate": 0.0002845547933404002, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "loss": 0.34922516345977783, + "step": 4110 + }, + { + "ce_loss": 0.15206386148929596, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "distill_loss": 0.08681447803974152, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "ref_ce_loss": 0.10820655524730682, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "loss": 0.3788524568080902, + "step": 4110 + }, + { + "ce_loss": 0.15653900802135468, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "distill_loss": 0.09988351911306381, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "ref_ce_loss": 0.12210514396429062, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "loss": 0.6386069655418396, + "step": 4110 + }, + { + "ce_loss": 0.16010144352912903, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "distill_loss": 0.13131670653820038, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "ref_ce_loss": 0.12684138119220734, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "loss": 0.5312026143074036, + "step": 4110 + }, + { + "ce_loss": 0.2253277450799942, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "distill_loss": 0.10457585752010345, + "epoch": 1.370913942628419, + "step": 4110 + }, + { + "epoch": 1.370913942628419, + "ref_ce_loss": 0.1474202573299408, + "step": 4110 + }, + { + "epoch": 1.3742494996664443, + "loss": 0.5724, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "grad_norm": 2.848768949508667, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "learning_rate": 0.00028446514548053194, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "loss": 0.35788092017173767, + "step": 4120 + }, + { + "ce_loss": 0.08504143357276917, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "distill_loss": 0.08752532303333282, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "ref_ce_loss": 0.0958314761519432, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "loss": 1.4491925239562988, + "step": 4120 + }, + { + "ce_loss": 0.3018847107887268, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "distill_loss": 0.14247466623783112, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "ref_ce_loss": 0.14462648332118988, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "loss": 0.3580704927444458, + "step": 4120 + }, + { + "ce_loss": 0.12576836347579956, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "distill_loss": 0.11499390751123428, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "ref_ce_loss": 0.1170811876654625, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "loss": 0.6562156081199646, + "step": 4120 + }, + { + "ce_loss": 0.2465030401945114, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "distill_loss": 0.12957395613193512, + "epoch": 1.3742494996664443, + "step": 4120 + }, + { + "epoch": 1.3742494996664443, + "ref_ce_loss": 0.13839758932590485, + "step": 4120 + }, + { + "epoch": 1.3775850567044696, + "loss": 0.6316, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "grad_norm": 2.6617259979248047, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "learning_rate": 0.0002843752524092805, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "loss": 0.5867189764976501, + "step": 4130 + }, + { + "ce_loss": 0.22917450964450836, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "distill_loss": 0.10425989329814911, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "ref_ce_loss": 0.21919582784175873, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "loss": 0.7949258685112, + "step": 4130 + }, + { + "ce_loss": 0.29066002368927, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "distill_loss": 0.10276904702186584, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "ref_ce_loss": 0.1894405633211136, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "loss": 0.5634397268295288, + "step": 4130 + }, + { + "ce_loss": 0.204814150929451, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "distill_loss": 0.09537995606660843, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "ref_ce_loss": 0.13756123185157776, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "loss": 0.7808976769447327, + "step": 4130 + }, + { + "ce_loss": 0.25047364830970764, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "distill_loss": 0.12359421700239182, + "epoch": 1.3775850567044696, + "step": 4130 + }, + { + "epoch": 1.3775850567044696, + "ref_ce_loss": 0.22865727543830872, + "step": 4130 + }, + { + "epoch": 1.380920613742495, + "loss": 0.6169, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "grad_norm": 2.3741250038146973, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "learning_rate": 0.0002842851142905754, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "loss": 0.8009947538375854, + "step": 4140 + }, + { + "ce_loss": 0.3151392638683319, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "distill_loss": 0.13409563899040222, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "ref_ce_loss": 0.17703871428966522, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "loss": 0.525648295879364, + "step": 4140 + }, + { + "ce_loss": 0.203111931681633, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "distill_loss": 0.09106729179620743, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "ref_ce_loss": 0.1332019865512848, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "loss": 0.4340347349643707, + "step": 4140 + }, + { + "ce_loss": 0.14689216017723083, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "distill_loss": 0.1072496846318245, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "ref_ce_loss": 0.12199627608060837, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "loss": 0.6135590076446533, + "step": 4140 + }, + { + "ce_loss": 0.24815963208675385, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "distill_loss": 0.09719424694776535, + "epoch": 1.380920613742495, + "step": 4140 + }, + { + "epoch": 1.380920613742495, + "ref_ce_loss": 0.2031925618648529, + "step": 4140 + }, + { + "epoch": 1.3842561707805203, + "loss": 0.6132, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "grad_norm": 2.4850473403930664, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "learning_rate": 0.0002841947312887929, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "loss": 0.6122986078262329, + "step": 4150 + }, + { + "ce_loss": 0.19411823153495789, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "distill_loss": 0.09509175270795822, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "ref_ce_loss": 0.12494823336601257, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "loss": 0.2789407968521118, + "step": 4150 + }, + { + "ce_loss": 0.11748460680246353, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "distill_loss": 0.09389721602201462, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "ref_ce_loss": 0.0674012303352356, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "loss": 0.4519701600074768, + "step": 4150 + }, + { + "ce_loss": 0.22965377569198608, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "distill_loss": 0.07461632788181305, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "ref_ce_loss": 0.14758454263210297, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "loss": 0.3582833409309387, + "step": 4150 + }, + { + "ce_loss": 0.0969460979104042, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "distill_loss": 0.07818230241537094, + "epoch": 1.3842561707805203, + "step": 4150 + }, + { + "epoch": 1.3842561707805203, + "ref_ce_loss": 0.1007329449057579, + "step": 4150 + }, + { + "epoch": 1.3875917278185457, + "loss": 0.6136, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "grad_norm": 2.14587664604187, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "learning_rate": 0.00028410410356875614, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "loss": 0.6464195251464844, + "step": 4160 + }, + { + "ce_loss": 0.2074149250984192, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "distill_loss": 0.10325966775417328, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "ref_ce_loss": 0.12344758957624435, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "loss": 0.7069451212882996, + "step": 4160 + }, + { + "ce_loss": 0.1398283839225769, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "distill_loss": 0.10599468648433685, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "ref_ce_loss": 0.15541532635688782, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "loss": 0.52448970079422, + "step": 4160 + }, + { + "ce_loss": 0.18425686657428741, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "distill_loss": 0.10802476853132248, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "ref_ce_loss": 0.1491173356771469, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "loss": 0.5847429037094116, + "step": 4160 + }, + { + "ce_loss": 0.18347282707691193, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "distill_loss": 0.09988974034786224, + "epoch": 1.3875917278185457, + "step": 4160 + }, + { + "epoch": 1.3875917278185457, + "ref_ce_loss": 0.19402439892292023, + "step": 4160 + }, + { + "epoch": 1.390927284856571, + "loss": 0.651, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "grad_norm": 4.222642421722412, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "learning_rate": 0.00028401323129573415, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "loss": 0.5438768863677979, + "step": 4170 + }, + { + "ce_loss": 0.24094192683696747, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "distill_loss": 0.10817679762840271, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "ref_ce_loss": 0.15798398852348328, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "loss": 0.7662238478660583, + "step": 4170 + }, + { + "ce_loss": 0.28996947407722473, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "distill_loss": 0.12934328615665436, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "ref_ce_loss": 0.1425507664680481, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "loss": 0.790197491645813, + "step": 4170 + }, + { + "ce_loss": 0.18555812537670135, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "distill_loss": 0.08629944920539856, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "ref_ce_loss": 0.12687258422374725, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "loss": 0.6215685606002808, + "step": 4170 + }, + { + "ce_loss": 0.21749264001846313, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "distill_loss": 0.09972754865884781, + "epoch": 1.390927284856571, + "step": 4170 + }, + { + "epoch": 1.390927284856571, + "ref_ce_loss": 0.10281194746494293, + "step": 4170 + }, + { + "epoch": 1.3942628418945964, + "loss": 0.5929, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "grad_norm": 3.127805709838867, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "learning_rate": 0.00028392211463544224, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "loss": 1.1103284358978271, + "step": 4180 + }, + { + "ce_loss": 0.14541105926036835, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "distill_loss": 0.08611320704221725, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "ref_ce_loss": 0.1386411339044571, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "loss": 0.7187755107879639, + "step": 4180 + }, + { + "ce_loss": 0.12636052072048187, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "distill_loss": 0.08641035109758377, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "ref_ce_loss": 0.09005328267812729, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "loss": 0.580119252204895, + "step": 4180 + }, + { + "ce_loss": 0.16198816895484924, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "distill_loss": 0.07411843538284302, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "ref_ce_loss": 0.13253140449523926, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "loss": 0.5698190927505493, + "step": 4180 + }, + { + "ce_loss": 0.2511466443538666, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "distill_loss": 0.11713631451129913, + "epoch": 1.3942628418945964, + "step": 4180 + }, + { + "epoch": 1.3942628418945964, + "ref_ce_loss": 0.08613384515047073, + "step": 4180 + }, + { + "epoch": 1.3975983989326217, + "loss": 0.6207, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "grad_norm": 2.769015312194824, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "learning_rate": 0.0002838307537540411, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "loss": 0.7650259733200073, + "step": 4190 + }, + { + "ce_loss": 0.2717476189136505, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "distill_loss": 0.0811992660164833, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "ref_ce_loss": 0.21618877351284027, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "loss": 0.9913129806518555, + "step": 4190 + }, + { + "ce_loss": 0.22515885531902313, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "distill_loss": 0.10443403571844101, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "ref_ce_loss": 0.1330164223909378, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "loss": 0.458392858505249, + "step": 4190 + }, + { + "ce_loss": 0.2054453045129776, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "distill_loss": 0.08475649356842041, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "ref_ce_loss": 0.11416659504175186, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "loss": 0.9844976663589478, + "step": 4190 + }, + { + "ce_loss": 0.2999069392681122, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "distill_loss": 0.10008185356855392, + "epoch": 1.3975983989326217, + "step": 4190 + }, + { + "epoch": 1.3975983989326217, + "ref_ce_loss": 0.1291126161813736, + "step": 4190 + }, + { + "epoch": 1.400933955970647, + "loss": 0.6316, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "grad_norm": 1.8868513107299805, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "learning_rate": 0.00028373914881813715, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "loss": 0.910756528377533, + "step": 4200 + }, + { + "ce_loss": 0.16571709513664246, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "distill_loss": 0.09719116985797882, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "ref_ce_loss": 0.1444815844297409, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "loss": 0.41164451837539673, + "step": 4200 + }, + { + "ce_loss": 0.12055205553770065, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "distill_loss": 0.0802890807390213, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "ref_ce_loss": 0.07115507870912552, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "loss": 0.6578470468521118, + "step": 4200 + }, + { + "ce_loss": 0.11765425652265549, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "distill_loss": 0.11294358223676682, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "ref_ce_loss": 0.16021910309791565, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "loss": 0.5982992649078369, + "step": 4200 + }, + { + "ce_loss": 0.2084062397480011, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "distill_loss": 0.1283988654613495, + "epoch": 1.400933955970647, + "step": 4200 + }, + { + "epoch": 1.400933955970647, + "ref_ce_loss": 0.18639861047267914, + "step": 4200 + }, + { + "epoch": 1.4042695130086724, + "loss": 0.6715, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "grad_norm": 2.1879544258117676, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "learning_rate": 0.00028364729999478145, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "loss": 0.6384764909744263, + "step": 4210 + }, + { + "ce_loss": 0.2525661289691925, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "distill_loss": 0.09707111120223999, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "ref_ce_loss": 0.18559350073337555, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "loss": 0.5137649178504944, + "step": 4210 + }, + { + "ce_loss": 0.20408010482788086, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "distill_loss": 0.12385561317205429, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "ref_ce_loss": 0.1269007921218872, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "loss": 0.5365128517150879, + "step": 4210 + }, + { + "ce_loss": 0.2322646677494049, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "distill_loss": 0.1266290843486786, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "ref_ce_loss": 0.12365609407424927, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "loss": 0.30896055698394775, + "step": 4210 + }, + { + "ce_loss": 0.10330818593502045, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "distill_loss": 0.0924607366323471, + "epoch": 1.4042695130086724, + "step": 4210 + }, + { + "epoch": 1.4042695130086724, + "ref_ce_loss": 0.11242125183343887, + "step": 4210 + }, + { + "epoch": 1.4076050700466978, + "loss": 0.6023, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "grad_norm": 3.4202616214752197, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "learning_rate": 0.0002835552074514702, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "loss": 0.3378927707672119, + "step": 4220 + }, + { + "ce_loss": 0.07960563153028488, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "distill_loss": 0.0758289247751236, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "ref_ce_loss": 0.06120488420128822, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "loss": 0.4977499842643738, + "step": 4220 + }, + { + "ce_loss": 0.19765213131904602, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "distill_loss": 0.10244348645210266, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "ref_ce_loss": 0.15283609926700592, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "loss": 0.6151387691497803, + "step": 4220 + }, + { + "ce_loss": 0.2646377980709076, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "distill_loss": 0.1459384262561798, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "ref_ce_loss": 0.1401008814573288, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "loss": 0.5750916004180908, + "step": 4220 + }, + { + "ce_loss": 0.2228243350982666, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "distill_loss": 0.09344765543937683, + "epoch": 1.4076050700466978, + "step": 4220 + }, + { + "epoch": 1.4076050700466978, + "ref_ce_loss": 0.1685912162065506, + "step": 4220 + }, + { + "epoch": 1.4109406270847231, + "loss": 0.6674, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "grad_norm": 2.043844223022461, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "learning_rate": 0.00028346287135614376, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "loss": 0.5440961122512817, + "step": 4230 + }, + { + "ce_loss": 0.21927763521671295, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "distill_loss": 0.09879957139492035, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "ref_ce_loss": 0.11110997945070267, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "loss": 0.6085052490234375, + "step": 4230 + }, + { + "ce_loss": 0.2495298832654953, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "distill_loss": 0.12600085139274597, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "ref_ce_loss": 0.1036033183336258, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "loss": 0.7630167007446289, + "step": 4230 + }, + { + "ce_loss": 0.21309255063533783, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "distill_loss": 0.11631793528795242, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "ref_ce_loss": 0.14728353917598724, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "loss": 0.3512745201587677, + "step": 4230 + }, + { + "ce_loss": 0.15760624408721924, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "distill_loss": 0.08606228977441788, + "epoch": 1.4109406270847231, + "step": 4230 + }, + { + "epoch": 1.4109406270847231, + "ref_ce_loss": 0.10728107392787933, + "step": 4230 + }, + { + "epoch": 1.4142761841227485, + "loss": 0.6071, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "grad_norm": 3.563107967376709, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "learning_rate": 0.0002833702918771868, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "loss": 0.6661812663078308, + "step": 4240 + }, + { + "ce_loss": 0.2621847689151764, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "distill_loss": 0.12339521944522858, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "ref_ce_loss": 0.11897028237581253, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "loss": 0.3537221848964691, + "step": 4240 + }, + { + "ce_loss": 0.12370649725198746, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "distill_loss": 0.08683275431394577, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "ref_ce_loss": 0.08618742972612381, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "loss": 1.1888618469238281, + "step": 4240 + }, + { + "ce_loss": 0.35263460874557495, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "distill_loss": 0.15241080522537231, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "ref_ce_loss": 0.19804182648658752, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "loss": 0.6669833064079285, + "step": 4240 + }, + { + "ce_loss": 0.19273287057876587, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "distill_loss": 0.11122193187475204, + "epoch": 1.4142761841227485, + "step": 4240 + }, + { + "epoch": 1.4142761841227485, + "ref_ce_loss": 0.159165620803833, + "step": 4240 + }, + { + "epoch": 1.4176117411607738, + "loss": 0.596, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "grad_norm": 3.0415971279144287, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "learning_rate": 0.00028327746918342764, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "loss": 0.6574098467826843, + "step": 4250 + }, + { + "ce_loss": 0.35821470618247986, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "distill_loss": 0.10992508381605148, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "ref_ce_loss": 0.14264172315597534, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "loss": 0.7249197959899902, + "step": 4250 + }, + { + "ce_loss": 0.32912251353263855, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "distill_loss": 0.11514975875616074, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "ref_ce_loss": 0.1938469558954239, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "loss": 0.790244460105896, + "step": 4250 + }, + { + "ce_loss": 0.29465973377227783, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "distill_loss": 0.09840358048677444, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "ref_ce_loss": 0.1359245926141739, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "loss": 0.7141227722167969, + "step": 4250 + }, + { + "ce_loss": 0.3344563841819763, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "distill_loss": 0.11084728688001633, + "epoch": 1.4176117411607738, + "step": 4250 + }, + { + "epoch": 1.4176117411607738, + "ref_ce_loss": 0.17706482112407684, + "step": 4250 + }, + { + "epoch": 1.4209472981987992, + "loss": 0.6059, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "grad_norm": 2.2450828552246094, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "learning_rate": 0.0002831844034441384, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "loss": 0.878659725189209, + "step": 4260 + }, + { + "ce_loss": 0.29933464527130127, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "distill_loss": 0.1180991381406784, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "ref_ce_loss": 0.1688729077577591, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "loss": 0.820519745349884, + "step": 4260 + }, + { + "ce_loss": 0.38678011298179626, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "distill_loss": 0.12214456498622894, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "ref_ce_loss": 0.1592690348625183, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "loss": 0.5947272777557373, + "step": 4260 + }, + { + "ce_loss": 0.23920579254627228, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "distill_loss": 0.09341529756784439, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "ref_ce_loss": 0.15410996973514557, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "loss": 0.5958114862442017, + "step": 4260 + }, + { + "ce_loss": 0.22531208395957947, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "distill_loss": 0.10374058037996292, + "epoch": 1.4209472981987992, + "step": 4260 + }, + { + "epoch": 1.4209472981987992, + "ref_ce_loss": 0.16780097782611847, + "step": 4260 + }, + { + "epoch": 1.4242828552368245, + "loss": 0.591, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "grad_norm": 2.1468281745910645, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "learning_rate": 0.0002830910948290343, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "loss": 0.4912702739238739, + "step": 4270 + }, + { + "ce_loss": 0.15433534979820251, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "distill_loss": 0.09245024621486664, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "ref_ce_loss": 0.10943099856376648, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "loss": 0.5627715587615967, + "step": 4270 + }, + { + "ce_loss": 0.20639285445213318, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "distill_loss": 0.10644049197435379, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "ref_ce_loss": 0.12591655552387238, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "loss": 0.43795809149742126, + "step": 4270 + }, + { + "ce_loss": 0.21394628286361694, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "distill_loss": 0.10047397017478943, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "ref_ce_loss": 0.12295599281787872, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "loss": 0.5432189702987671, + "step": 4270 + }, + { + "ce_loss": 0.16253937780857086, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "distill_loss": 0.11227938532829285, + "epoch": 1.4242828552368245, + "step": 4270 + }, + { + "epoch": 1.4242828552368245, + "ref_ce_loss": 0.14385241270065308, + "step": 4270 + }, + { + "epoch": 1.4276184122748499, + "loss": 0.9214, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "grad_norm": 4.369274616241455, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "learning_rate": 0.00028299754350827333, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "loss": 1.351196527481079, + "step": 4280 + }, + { + "ce_loss": 0.18473805487155914, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "distill_loss": 0.4738437831401825, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "ref_ce_loss": 0.16030266880989075, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "loss": 1.2438302040100098, + "step": 4280 + }, + { + "ce_loss": 0.26313990354537964, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "distill_loss": 0.6745567321777344, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "ref_ce_loss": 0.19730761647224426, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "loss": 2.1819138526916504, + "step": 4280 + }, + { + "ce_loss": 0.30457887053489685, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "distill_loss": 0.563275933265686, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "ref_ce_loss": 0.16263462603092194, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "loss": 0.6994156837463379, + "step": 4280 + }, + { + "ce_loss": 0.14273491501808167, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "distill_loss": 0.3719584345817566, + "epoch": 1.4276184122748499, + "step": 4280 + }, + { + "epoch": 1.4276184122748499, + "ref_ce_loss": 0.13406719267368317, + "step": 4280 + }, + { + "epoch": 1.4309539693128752, + "loss": 0.9138, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "grad_norm": 3.0709683895111084, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "learning_rate": 0.00028290374965245625, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "loss": 0.8565329909324646, + "step": 4290 + }, + { + "ce_loss": 0.26970455050468445, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "distill_loss": 0.3865727484226227, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "ref_ce_loss": 0.15481248497962952, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "loss": 0.8524794578552246, + "step": 4290 + }, + { + "ce_loss": 0.3532913625240326, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "distill_loss": 0.355815589427948, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "ref_ce_loss": 0.14321820437908173, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "loss": 0.5584589242935181, + "step": 4290 + }, + { + "ce_loss": 0.18174344301223755, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "distill_loss": 0.24146713316440582, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "ref_ce_loss": 0.13489829003810883, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "loss": 0.9188706874847412, + "step": 4290 + }, + { + "ce_loss": 0.2061683088541031, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "distill_loss": 0.24299702048301697, + "epoch": 1.4309539693128752, + "step": 4290 + }, + { + "epoch": 1.4309539693128752, + "ref_ce_loss": 0.1520266830921173, + "step": 4290 + }, + { + "epoch": 1.4342895263509006, + "loss": 0.7849, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "grad_norm": 3.8154234886169434, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "learning_rate": 0.0002828097134326261, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "loss": 0.8924685716629028, + "step": 4300 + }, + { + "ce_loss": 0.22914117574691772, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "distill_loss": 0.32076436281204224, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "ref_ce_loss": 0.2145729959011078, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "loss": 0.9288495779037476, + "step": 4300 + }, + { + "ce_loss": 0.28972870111465454, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "distill_loss": 0.4693968892097473, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "ref_ce_loss": 0.11189575493335724, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "loss": 0.7833391427993774, + "step": 4300 + }, + { + "ce_loss": 0.21527405083179474, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "distill_loss": 0.2836889922618866, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "ref_ce_loss": 0.1465625911951065, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "loss": 0.5358696579933167, + "step": 4300 + }, + { + "ce_loss": 0.1487964242696762, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "distill_loss": 0.2489507794380188, + "epoch": 1.4342895263509006, + "step": 4300 + }, + { + "epoch": 1.4342895263509006, + "ref_ce_loss": 0.09456692636013031, + "step": 4300 + }, + { + "epoch": 1.437625083388926, + "loss": 0.8125, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "grad_norm": 3.7421693801879883, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "learning_rate": 0.0002827154350202678, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "loss": 0.7215092778205872, + "step": 4310 + }, + { + "ce_loss": 0.19476166367530823, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "distill_loss": 0.2617197632789612, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "ref_ce_loss": 0.1460052728652954, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "loss": 0.44765257835388184, + "step": 4310 + }, + { + "ce_loss": 0.0965539962053299, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "distill_loss": 0.24878545105457306, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "ref_ce_loss": 0.1019732877612114, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "loss": 0.609957754611969, + "step": 4310 + }, + { + "ce_loss": 0.18919765949249268, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "distill_loss": 0.27180278301239014, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "ref_ce_loss": 0.11347243934869766, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "loss": 0.624266505241394, + "step": 4310 + }, + { + "ce_loss": 0.2113901674747467, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "distill_loss": 0.22982464730739594, + "epoch": 1.437625083388926, + "step": 4310 + }, + { + "epoch": 1.437625083388926, + "ref_ce_loss": 0.11574660986661911, + "step": 4310 + }, + { + "epoch": 1.4409606404269513, + "loss": 0.7445, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "grad_norm": 3.571347713470459, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "learning_rate": 0.000282620914587308, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "loss": 0.48458024859428406, + "step": 4320 + }, + { + "ce_loss": 0.1750793755054474, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "distill_loss": 0.19086973369121552, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "ref_ce_loss": 0.11848335713148117, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "loss": 0.6750709414482117, + "step": 4320 + }, + { + "ce_loss": 0.23150256276130676, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "distill_loss": 0.22192597389221191, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "ref_ce_loss": 0.1723523885011673, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "loss": 0.7323600053787231, + "step": 4320 + }, + { + "ce_loss": 0.3054851293563843, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "distill_loss": 0.23578448593616486, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "ref_ce_loss": 0.19094038009643555, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "loss": 0.7869250178337097, + "step": 4320 + }, + { + "ce_loss": 0.27868741750717163, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "distill_loss": 0.2345450520515442, + "epoch": 1.4409606404269513, + "step": 4320 + }, + { + "epoch": 1.4409606404269513, + "ref_ce_loss": 0.18001769483089447, + "step": 4320 + }, + { + "epoch": 1.4442961974649766, + "loss": 0.6926, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "grad_norm": 2.8783340454101562, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "learning_rate": 0.0002825261523061146, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "loss": 1.5428657531738281, + "step": 4330 + }, + { + "ce_loss": 0.20685748755931854, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "distill_loss": 0.15463702380657196, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "ref_ce_loss": 0.14646972715854645, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "loss": 1.6755146980285645, + "step": 4330 + }, + { + "ce_loss": 0.1802901327610016, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "distill_loss": 0.1890738308429718, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "ref_ce_loss": 0.12323080003261566, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "loss": 0.6056669354438782, + "step": 4330 + }, + { + "ce_loss": 0.13862594962120056, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "distill_loss": 0.15863680839538574, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "ref_ce_loss": 0.20575878024101257, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "loss": 0.531682014465332, + "step": 4330 + }, + { + "ce_loss": 0.20502084493637085, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "distill_loss": 0.17332421243190765, + "epoch": 1.4442961974649766, + "step": 4330 + }, + { + "epoch": 1.4442961974649766, + "ref_ce_loss": 0.1205807700753212, + "step": 4330 + }, + { + "epoch": 1.447631754503002, + "loss": 0.6714, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "grad_norm": 3.1532366275787354, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "learning_rate": 0.00028243114834949673, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "loss": 0.8360211849212646, + "step": 4340 + }, + { + "ce_loss": 0.3114469051361084, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "distill_loss": 0.14694617688655853, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "ref_ce_loss": 0.15568295121192932, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "loss": 0.6691051721572876, + "step": 4340 + }, + { + "ce_loss": 0.3203543722629547, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "distill_loss": 0.13674457371234894, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "ref_ce_loss": 0.15188376605510712, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "loss": 0.6616179347038269, + "step": 4340 + }, + { + "ce_loss": 0.26779434084892273, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "distill_loss": 0.11278953403234482, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "ref_ce_loss": 0.21746957302093506, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "loss": 0.48690155148506165, + "step": 4340 + }, + { + "ce_loss": 0.1967121809720993, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "distill_loss": 0.10658518970012665, + "epoch": 1.447631754503002, + "step": 4340 + }, + { + "epoch": 1.447631754503002, + "ref_ce_loss": 0.12141291052103043, + "step": 4340 + }, + { + "epoch": 1.4509673115410273, + "loss": 0.6241, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "grad_norm": 2.640625476837158, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "learning_rate": 0.0002823359028907041, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "loss": 0.7299114465713501, + "step": 4350 + }, + { + "ce_loss": 0.22083674371242523, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "distill_loss": 0.19083622097969055, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "ref_ce_loss": 0.12957577407360077, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "loss": 0.5755342841148376, + "step": 4350 + }, + { + "ce_loss": 0.25721633434295654, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "distill_loss": 0.1757301241159439, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "ref_ce_loss": 0.14243139326572418, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "loss": 0.7849111557006836, + "step": 4350 + }, + { + "ce_loss": 0.2349243015050888, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "distill_loss": 0.17022936046123505, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "ref_ce_loss": 0.15478086471557617, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "loss": 0.49518242478370667, + "step": 4350 + }, + { + "ce_loss": 0.12407956272363663, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "distill_loss": 0.18435871601104736, + "epoch": 1.4509673115410273, + "step": 4350 + }, + { + "epoch": 1.4509673115410273, + "ref_ce_loss": 0.08474914729595184, + "step": 4350 + }, + { + "epoch": 1.4543028685790527, + "loss": 0.6729, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "grad_norm": 3.134436845779419, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "learning_rate": 0.00028224041610342684, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "loss": 0.6206045746803284, + "step": 4360 + }, + { + "ce_loss": 0.19397678971290588, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "distill_loss": 0.17355449497699738, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "ref_ce_loss": 0.1407630443572998, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "loss": 0.5650532245635986, + "step": 4360 + }, + { + "ce_loss": 0.1520107090473175, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "distill_loss": 0.20400208234786987, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "ref_ce_loss": 0.12605783343315125, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "loss": 0.7112383246421814, + "step": 4360 + }, + { + "ce_loss": 0.2116408795118332, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "distill_loss": 0.253677099943161, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "ref_ce_loss": 0.15583555400371552, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "loss": 0.557046115398407, + "step": 4360 + }, + { + "ce_loss": 0.21234719455242157, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "distill_loss": 0.2124667763710022, + "epoch": 1.4543028685790527, + "step": 4360 + }, + { + "epoch": 1.4543028685790527, + "ref_ce_loss": 0.1319587528705597, + "step": 4360 + }, + { + "epoch": 1.457638425617078, + "loss": 0.7405, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "grad_norm": 2.821734666824341, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "learning_rate": 0.0002821446881617952, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "loss": 0.4984533488750458, + "step": 4370 + }, + { + "ce_loss": 0.16452477872371674, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "distill_loss": 0.17812155187129974, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "ref_ce_loss": 0.12596623599529266, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "loss": 0.5950712561607361, + "step": 4370 + }, + { + "ce_loss": 0.1918494552373886, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "distill_loss": 0.24175810813903809, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "ref_ce_loss": 0.1162458136677742, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "loss": 0.5311948657035828, + "step": 4370 + }, + { + "ce_loss": 0.1166113018989563, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "distill_loss": 0.1816932111978531, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "ref_ce_loss": 0.117859847843647, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "loss": 1.321441650390625, + "step": 4370 + }, + { + "ce_loss": 0.1763760894536972, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "distill_loss": 0.16680777072906494, + "epoch": 1.457638425617078, + "step": 4370 + }, + { + "epoch": 1.457638425617078, + "ref_ce_loss": 0.1586695909500122, + "step": 4370 + }, + { + "epoch": 1.4609739826551034, + "loss": 0.6801, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "grad_norm": 2.483330726623535, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "learning_rate": 0.0002820487192403792, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "loss": 0.8458122611045837, + "step": 4380 + }, + { + "ce_loss": 0.18738530576229095, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "distill_loss": 0.14879602193832397, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "ref_ce_loss": 0.1313273310661316, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "loss": 0.9360214471817017, + "step": 4380 + }, + { + "ce_loss": 0.09423506259918213, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "distill_loss": 0.1366136372089386, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "ref_ce_loss": 0.11418969184160233, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "loss": 0.4507545232772827, + "step": 4380 + }, + { + "ce_loss": 0.16262701153755188, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "distill_loss": 0.13440194725990295, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "ref_ce_loss": 0.1535230129957199, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "loss": 0.8710918426513672, + "step": 4380 + }, + { + "ce_loss": 0.18411631882190704, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "distill_loss": 0.14756955206394196, + "epoch": 1.4609739826551034, + "step": 4380 + }, + { + "epoch": 1.4609739826551034, + "ref_ce_loss": 0.15467248857021332, + "step": 4380 + }, + { + "epoch": 1.4643095396931287, + "loss": 0.6259, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "grad_norm": 2.173874855041504, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "learning_rate": 0.0002819525095141883, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "loss": 0.42656779289245605, + "step": 4390 + }, + { + "ce_loss": 0.12446770817041397, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "distill_loss": 0.1477583944797516, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "ref_ce_loss": 0.08694154024124146, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "loss": 0.5260968804359436, + "step": 4390 + }, + { + "ce_loss": 0.18589136004447937, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "distill_loss": 0.16689875721931458, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "ref_ce_loss": 0.17246730625629425, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "loss": 0.5359268188476562, + "step": 4390 + }, + { + "ce_loss": 0.11834832280874252, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "distill_loss": 0.1549883782863617, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "ref_ce_loss": 0.1455782651901245, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "loss": 0.4305720925331116, + "step": 4390 + }, + { + "ce_loss": 0.16029563546180725, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "distill_loss": 0.15057703852653503, + "epoch": 1.4643095396931287, + "step": 4390 + }, + { + "epoch": 1.4643095396931287, + "ref_ce_loss": 0.1186305582523346, + "step": 4390 + }, + { + "epoch": 1.467645096731154, + "loss": 0.6841, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "grad_norm": 4.759480953216553, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "learning_rate": 0.000281856059158671, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "loss": 0.7277843952178955, + "step": 4400 + }, + { + "ce_loss": 0.26591551303863525, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "distill_loss": 0.1408987045288086, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "ref_ce_loss": 0.15926964581012726, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "loss": 0.5168325901031494, + "step": 4400 + }, + { + "ce_loss": 0.23409435153007507, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "distill_loss": 0.10548853874206543, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "ref_ce_loss": 0.175302654504776, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "loss": 0.7794098854064941, + "step": 4400 + }, + { + "ce_loss": 0.19064731895923615, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "distill_loss": 0.1546446979045868, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "ref_ce_loss": 0.1138349249958992, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "loss": 0.7018890380859375, + "step": 4400 + }, + { + "ce_loss": 0.205464169383049, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "distill_loss": 0.14657650887966156, + "epoch": 1.467645096731154, + "step": 4400 + }, + { + "epoch": 1.467645096731154, + "ref_ce_loss": 0.1572255939245224, + "step": 4400 + }, + { + "epoch": 1.4709806537691794, + "loss": 0.6051, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "grad_norm": 2.3284032344818115, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "learning_rate": 0.0002817593683497148, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "loss": 0.4930034279823303, + "step": 4410 + }, + { + "ce_loss": 0.15108412504196167, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "distill_loss": 0.10944245755672455, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "ref_ce_loss": 0.14975978434085846, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "loss": 0.7254599928855896, + "step": 4410 + }, + { + "ce_loss": 0.21982063353061676, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "distill_loss": 0.10534757375717163, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "ref_ce_loss": 0.20826774835586548, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "loss": 0.5473403930664062, + "step": 4410 + }, + { + "ce_loss": 0.1677827686071396, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "distill_loss": 0.12926937639713287, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "ref_ce_loss": 0.07822735607624054, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "loss": 1.0237935781478882, + "step": 4410 + }, + { + "ce_loss": 0.198973149061203, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "distill_loss": 0.12356914579868317, + "epoch": 1.4709806537691794, + "step": 4410 + }, + { + "epoch": 1.4709806537691794, + "ref_ce_loss": 0.1262451410293579, + "step": 4410 + }, + { + "epoch": 1.4743162108072048, + "loss": 0.6375, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "grad_norm": 5.017458915710449, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "learning_rate": 0.00028166243726364555, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "loss": 0.5311963558197021, + "step": 4420 + }, + { + "ce_loss": 0.1883728802204132, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "distill_loss": 0.10420016944408417, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "ref_ce_loss": 0.09754882007837296, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "loss": 0.5212543606758118, + "step": 4420 + }, + { + "ce_loss": 0.2502850592136383, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "distill_loss": 0.13868878781795502, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "ref_ce_loss": 0.1322166621685028, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "loss": 0.7725924849510193, + "step": 4420 + }, + { + "ce_loss": 0.211654394865036, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "distill_loss": 0.12484170496463776, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "ref_ce_loss": 0.10742399841547012, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "loss": 0.449934184551239, + "step": 4420 + }, + { + "ce_loss": 0.21506257355213165, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "distill_loss": 0.11620468646287918, + "epoch": 1.4743162108072048, + "step": 4420 + }, + { + "epoch": 1.4743162108072048, + "ref_ce_loss": 0.09754442423582077, + "step": 4420 + }, + { + "epoch": 1.4776517678452301, + "loss": 0.6099, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "grad_norm": 2.544800281524658, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "learning_rate": 0.0002815652660772273, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "loss": 0.5477819442749023, + "step": 4430 + }, + { + "ce_loss": 0.24007096886634827, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "distill_loss": 0.08831313997507095, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "ref_ce_loss": 0.11866340786218643, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "loss": 0.46992701292037964, + "step": 4430 + }, + { + "ce_loss": 0.19611972570419312, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "distill_loss": 0.09721183776855469, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "ref_ce_loss": 0.11860304325819016, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "loss": 0.48709186911582947, + "step": 4430 + }, + { + "ce_loss": 0.22293464839458466, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "distill_loss": 0.07582428306341171, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "ref_ce_loss": 0.11847430467605591, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "loss": 0.9923147559165955, + "step": 4430 + }, + { + "ce_loss": 0.3206259310245514, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "distill_loss": 0.10310949385166168, + "epoch": 1.4776517678452301, + "step": 4430 + }, + { + "epoch": 1.4776517678452301, + "ref_ce_loss": 0.25434812903404236, + "step": 4430 + }, + { + "epoch": 1.4809873248832555, + "loss": 0.6815, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "grad_norm": 3.2926695346832275, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "learning_rate": 0.000281467854967662, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "loss": 0.7626287937164307, + "step": 4440 + }, + { + "ce_loss": 0.09362614899873734, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "distill_loss": 0.06907413899898529, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "ref_ce_loss": 0.0958833321928978, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "loss": 0.49843913316726685, + "step": 4440 + }, + { + "ce_loss": 0.2558717429637909, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "distill_loss": 0.09227704256772995, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "ref_ce_loss": 0.15015363693237305, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "loss": 0.6889381408691406, + "step": 4440 + }, + { + "ce_loss": 0.18226727843284607, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "distill_loss": 0.09716913849115372, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "ref_ce_loss": 0.1393125206232071, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "loss": 0.5315539240837097, + "step": 4440 + }, + { + "ce_loss": 0.2396669238805771, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "distill_loss": 0.09958633780479431, + "epoch": 1.4809873248832555, + "step": 4440 + }, + { + "epoch": 1.4809873248832555, + "ref_ce_loss": 0.12504830956459045, + "step": 4440 + }, + { + "epoch": 1.4843228819212808, + "loss": 0.5759, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "grad_norm": 1.7213834524154663, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "learning_rate": 0.0002813702041125891, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "loss": 0.5091803073883057, + "step": 4450 + }, + { + "ce_loss": 0.20656217634677887, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "distill_loss": 0.09335896372795105, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "ref_ce_loss": 0.13938568532466888, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "loss": 0.5872694253921509, + "step": 4450 + }, + { + "ce_loss": 0.2452130764722824, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "distill_loss": 0.10319434106349945, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "ref_ce_loss": 0.1681223064661026, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "loss": 0.6491241455078125, + "step": 4450 + }, + { + "ce_loss": 0.2033555805683136, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "distill_loss": 0.10134235769510269, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "ref_ce_loss": 0.09184518456459045, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "loss": 1.2535475492477417, + "step": 4450 + }, + { + "ce_loss": 0.2783401310443878, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "distill_loss": 0.09875951707363129, + "epoch": 1.4843228819212808, + "step": 4450 + }, + { + "epoch": 1.4843228819212808, + "ref_ce_loss": 0.16019049286842346, + "step": 4450 + }, + { + "epoch": 1.4876584389593062, + "loss": 0.6338, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "grad_norm": 2.5184757709503174, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "learning_rate": 0.00028127231369008525, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "loss": 0.8658751845359802, + "step": 4460 + }, + { + "ce_loss": 0.23773770034313202, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "distill_loss": 0.11125155538320541, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "ref_ce_loss": 0.12172891944646835, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "loss": 1.2557917833328247, + "step": 4460 + }, + { + "ce_loss": 0.18311569094657898, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "distill_loss": 0.0814177542924881, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "ref_ce_loss": 0.2201717346906662, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "loss": 0.4614761173725128, + "step": 4460 + }, + { + "ce_loss": 0.2173597365617752, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "distill_loss": 0.08315160125494003, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "ref_ce_loss": 0.1606503278017044, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "loss": 0.82685387134552, + "step": 4460 + }, + { + "ce_loss": 0.33986949920654297, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "distill_loss": 0.12730133533477783, + "epoch": 1.4876584389593062, + "step": 4460 + }, + { + "epoch": 1.4876584389593062, + "ref_ce_loss": 0.1949513703584671, + "step": 4460 + }, + { + "epoch": 1.4909939959973315, + "loss": 0.6531, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "grad_norm": 4.018879413604736, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "learning_rate": 0.00028117418387866384, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "loss": 0.4994458556175232, + "step": 4470 + }, + { + "ce_loss": 0.1709452122449875, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "distill_loss": 0.0967760682106018, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "ref_ce_loss": 0.1585451066493988, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "loss": 0.7291620969772339, + "step": 4470 + }, + { + "ce_loss": 0.1918286234140396, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "distill_loss": 0.13846111297607422, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "ref_ce_loss": 0.15093106031417847, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "loss": 0.5087245106697083, + "step": 4470 + }, + { + "ce_loss": 0.2672545909881592, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "distill_loss": 0.1381414383649826, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "ref_ce_loss": 0.10308218747377396, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "loss": 0.6515514850616455, + "step": 4470 + }, + { + "ce_loss": 0.15585504472255707, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "distill_loss": 0.10814927518367767, + "epoch": 1.4909939959973315, + "step": 4470 + }, + { + "epoch": 1.4909939959973315, + "ref_ce_loss": 0.12409359216690063, + "step": 4470 + }, + { + "epoch": 1.4943295530353569, + "loss": 0.5677, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "grad_norm": 2.1926746368408203, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "learning_rate": 0.00028107581485727507, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "loss": 0.43809303641319275, + "step": 4480 + }, + { + "ce_loss": 0.16458266973495483, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "distill_loss": 0.1329808533191681, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "ref_ce_loss": 0.14041352272033691, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "loss": 0.442251980304718, + "step": 4480 + }, + { + "ce_loss": 0.20063066482543945, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "distill_loss": 0.12663978338241577, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "ref_ce_loss": 0.11457744985818863, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "loss": 0.544518768787384, + "step": 4480 + }, + { + "ce_loss": 0.20446360111236572, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "distill_loss": 0.12162137031555176, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "ref_ce_loss": 0.15851354598999023, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "loss": 1.210282802581787, + "step": 4480 + }, + { + "ce_loss": 0.15788322687149048, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "distill_loss": 0.12018802762031555, + "epoch": 1.4943295530353569, + "step": 4480 + }, + { + "epoch": 1.4943295530353569, + "ref_ce_loss": 0.13062524795532227, + "step": 4480 + }, + { + "epoch": 1.4976651100733822, + "loss": 0.6311, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "grad_norm": 3.151811122894287, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "learning_rate": 0.0002809772068053052, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "loss": 1.0704823732376099, + "step": 4490 + }, + { + "ce_loss": 0.2743171453475952, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "distill_loss": 0.13374269008636475, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "ref_ce_loss": 0.09331765025854111, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "loss": 0.5207926630973816, + "step": 4490 + }, + { + "ce_loss": 0.23340195417404175, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "distill_loss": 0.11091122031211853, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "ref_ce_loss": 0.11333482712507248, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "loss": 0.5735177397727966, + "step": 4490 + }, + { + "ce_loss": 0.15200015902519226, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "distill_loss": 0.10044776648283005, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "ref_ce_loss": 0.11417778581380844, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "loss": 0.44036537408828735, + "step": 4490 + }, + { + "ce_loss": 0.18106389045715332, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "distill_loss": 0.09861503541469574, + "epoch": 1.4976651100733822, + "step": 4490 + }, + { + "epoch": 1.4976651100733822, + "ref_ce_loss": 0.1276373416185379, + "step": 4490 + }, + { + "epoch": 1.5010006671114076, + "loss": 0.6131, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "grad_norm": 2.3492748737335205, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "learning_rate": 0.0002808783599025764, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "loss": 0.5812758803367615, + "step": 4500 + }, + { + "ce_loss": 0.2145005613565445, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "distill_loss": 0.11526884138584137, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "ref_ce_loss": 0.12662525475025177, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "loss": 0.7451175451278687, + "step": 4500 + }, + { + "ce_loss": 0.18240326642990112, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "distill_loss": 0.09705304354429245, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "ref_ce_loss": 0.1561345010995865, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "loss": 0.4314347207546234, + "step": 4500 + }, + { + "ce_loss": 0.2048446536064148, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "distill_loss": 0.11508011817932129, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "ref_ce_loss": 0.11129327863454819, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "loss": 0.5526909232139587, + "step": 4500 + }, + { + "ce_loss": 0.20685704052448273, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "distill_loss": 0.13564878702163696, + "epoch": 1.5010006671114076, + "step": 4500 + }, + { + "epoch": 1.5010006671114076, + "ref_ce_loss": 0.13427762687206268, + "step": 4500 + }, + { + "epoch": 1.504336224149433, + "loss": 0.5843, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "grad_norm": 3.733797788619995, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "learning_rate": 0.00028077927432934645, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "loss": 0.6804251670837402, + "step": 4510 + }, + { + "ce_loss": 0.32606402039527893, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "distill_loss": 0.1469072699546814, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "ref_ce_loss": 0.16901257634162903, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "loss": 0.5420999526977539, + "step": 4510 + }, + { + "ce_loss": 0.23655718564987183, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "distill_loss": 0.09165996313095093, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "ref_ce_loss": 0.21362152695655823, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "loss": 0.35762321949005127, + "step": 4510 + }, + { + "ce_loss": 0.15837861597537994, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "distill_loss": 0.09633652865886688, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "ref_ce_loss": 0.1026814803481102, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "loss": 0.5618549585342407, + "step": 4510 + }, + { + "ce_loss": 0.2075524926185608, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "distill_loss": 0.10399427264928818, + "epoch": 1.504336224149433, + "step": 4510 + }, + { + "epoch": 1.504336224149433, + "ref_ce_loss": 0.12820473313331604, + "step": 4510 + }, + { + "epoch": 1.5076717811874583, + "loss": 0.5818, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "grad_norm": 3.101632595062256, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "learning_rate": 0.0002806799502663083, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "loss": 0.5161787867546082, + "step": 4520 + }, + { + "ce_loss": 0.2295653074979782, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "distill_loss": 0.1541547179222107, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "ref_ce_loss": 0.13225367665290833, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "loss": 0.7354413270950317, + "step": 4520 + }, + { + "ce_loss": 0.4115031957626343, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "distill_loss": 0.15912646055221558, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "ref_ce_loss": 0.16471584141254425, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "loss": 1.3074798583984375, + "step": 4520 + }, + { + "ce_loss": 0.2868303060531616, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "distill_loss": 0.14963319897651672, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "ref_ce_loss": 0.14439374208450317, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "loss": 0.6702625751495361, + "step": 4520 + }, + { + "ce_loss": 0.2194564938545227, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "distill_loss": 0.13500796258449554, + "epoch": 1.5076717811874583, + "step": 4520 + }, + { + "epoch": 1.5076717811874583, + "ref_ce_loss": 0.14340227842330933, + "step": 4520 + }, + { + "epoch": 1.5110073382254836, + "loss": 0.7013, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "grad_norm": 4.905083656311035, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "learning_rate": 0.00028058038789458993, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "loss": 0.6859695315361023, + "step": 4530 + }, + { + "ce_loss": 0.19414359331130981, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "distill_loss": 0.12613198161125183, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "ref_ce_loss": 0.11808284372091293, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "loss": 0.5683508515357971, + "step": 4530 + }, + { + "ce_loss": 0.237142413854599, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "distill_loss": 0.15945348143577576, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "ref_ce_loss": 0.16993550956249237, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "loss": 0.4896041452884674, + "step": 4530 + }, + { + "ce_loss": 0.1853276491165161, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "distill_loss": 0.14374326169490814, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "ref_ce_loss": 0.1603091061115265, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "loss": 0.8239613771438599, + "step": 4530 + }, + { + "ce_loss": 0.23428012430667877, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "distill_loss": 0.17898336052894592, + "epoch": 1.5110073382254836, + "step": 4530 + }, + { + "epoch": 1.5110073382254836, + "ref_ce_loss": 0.13119062781333923, + "step": 4530 + }, + { + "epoch": 1.514342895263509, + "loss": 0.6341, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "grad_norm": 2.772738456726074, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "learning_rate": 0.0002804805873957538, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "loss": 0.5715469717979431, + "step": 4540 + }, + { + "ce_loss": 0.18093213438987732, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "distill_loss": 0.15032875537872314, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "ref_ce_loss": 0.15656159818172455, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "loss": 0.45982062816619873, + "step": 4540 + }, + { + "ce_loss": 0.1117219477891922, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "distill_loss": 0.1376478224992752, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "ref_ce_loss": 0.08686988800764084, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "loss": 0.4616468548774719, + "step": 4540 + }, + { + "ce_loss": 0.14209826290607452, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "distill_loss": 0.13099880516529083, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "ref_ce_loss": 0.09180080890655518, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "loss": 0.6827181577682495, + "step": 4540 + }, + { + "ce_loss": 0.24990394711494446, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "distill_loss": 0.15088555216789246, + "epoch": 1.514342895263509, + "step": 4540 + }, + { + "epoch": 1.514342895263509, + "ref_ce_loss": 0.09840228408575058, + "step": 4540 + }, + { + "epoch": 1.5176784523015343, + "loss": 0.6297, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "grad_norm": 2.13785719871521, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "learning_rate": 0.0002803805489517966, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "loss": 0.4930049777030945, + "step": 4550 + }, + { + "ce_loss": 0.17963160574436188, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "distill_loss": 0.1052674651145935, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "ref_ce_loss": 0.1195664182305336, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "loss": 0.6021493077278137, + "step": 4550 + }, + { + "ce_loss": 0.1968289315700531, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "distill_loss": 0.1056264191865921, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "ref_ce_loss": 0.15359090268611908, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "loss": 0.7040922045707703, + "step": 4550 + }, + { + "ce_loss": 0.12671718001365662, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "distill_loss": 0.07407093793153763, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "ref_ce_loss": 0.13461308181285858, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "loss": 0.4749680161476135, + "step": 4550 + }, + { + "ce_loss": 0.19816166162490845, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "distill_loss": 0.09519590437412262, + "epoch": 1.5176784523015343, + "step": 4550 + }, + { + "epoch": 1.5176784523015343, + "ref_ce_loss": 0.11878227442502975, + "step": 4550 + }, + { + "epoch": 1.5210140093395597, + "loss": 0.589, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "grad_norm": 2.2736968994140625, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "learning_rate": 0.0002802802727451491, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "loss": 0.8191660642623901, + "step": 4560 + }, + { + "ce_loss": 0.22039499878883362, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "distill_loss": 0.09533695131540298, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "ref_ce_loss": 0.17217586934566498, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "loss": 0.4060024917125702, + "step": 4560 + }, + { + "ce_loss": 0.11546523869037628, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "distill_loss": 0.10482259094715118, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "ref_ce_loss": 0.14050264656543732, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "loss": 0.3823060989379883, + "step": 4560 + }, + { + "ce_loss": 0.127862811088562, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "distill_loss": 0.12342780083417892, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "ref_ce_loss": 0.09424252063035965, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "loss": 1.0516396760940552, + "step": 4560 + }, + { + "ce_loss": 0.30975213646888733, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "distill_loss": 0.13757649064064026, + "epoch": 1.5210140093395597, + "step": 4560 + }, + { + "epoch": 1.5210140093395597, + "ref_ce_loss": 0.19756831228733063, + "step": 4560 + }, + { + "epoch": 1.524349566377585, + "loss": 0.583, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "grad_norm": 2.270359754562378, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "learning_rate": 0.0002801797589586755, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "loss": 0.4193212687969208, + "step": 4570 + }, + { + "ce_loss": 0.14371411502361298, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "distill_loss": 0.10037150233983994, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "ref_ce_loss": 0.11395347863435745, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "loss": 0.4830290973186493, + "step": 4570 + }, + { + "ce_loss": 0.221247598528862, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "distill_loss": 0.0934707373380661, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "ref_ce_loss": 0.12097950279712677, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "loss": 0.5064443349838257, + "step": 4570 + }, + { + "ce_loss": 0.16764579713344574, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "distill_loss": 0.10355889797210693, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "ref_ce_loss": 0.1548304706811905, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "loss": 0.9362093210220337, + "step": 4570 + }, + { + "ce_loss": 0.29452425241470337, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "distill_loss": 0.13609839975833893, + "epoch": 1.524349566377585, + "step": 4570 + }, + { + "epoch": 1.524349566377585, + "ref_ce_loss": 0.13039226830005646, + "step": 4570 + }, + { + "epoch": 1.5276851234156104, + "loss": 0.6303, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "grad_norm": 2.9846997261047363, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "learning_rate": 0.00028007900777567325, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "loss": 1.1162803173065186, + "step": 4580 + }, + { + "ce_loss": 0.3441166579723358, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "distill_loss": 0.13471393287181854, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "ref_ce_loss": 0.14935973286628723, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "loss": 0.4999135136604309, + "step": 4580 + }, + { + "ce_loss": 0.19262626767158508, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "distill_loss": 0.13174332678318024, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "ref_ce_loss": 0.11405842751264572, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "loss": 0.5764414072036743, + "step": 4580 + }, + { + "ce_loss": 0.27998676896095276, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "distill_loss": 0.1316446214914322, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "ref_ce_loss": 0.16455060243606567, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "loss": 0.822860836982727, + "step": 4580 + }, + { + "ce_loss": 0.30782365798950195, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "distill_loss": 0.09736745804548264, + "epoch": 1.5276851234156104, + "step": 4580 + }, + { + "epoch": 1.5276851234156104, + "ref_ce_loss": 0.22478896379470825, + "step": 4580 + }, + { + "epoch": 1.5310206804536357, + "loss": 0.6253, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "grad_norm": 2.1859419345855713, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "learning_rate": 0.0002799780193798728, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "loss": 0.46541544795036316, + "step": 4590 + }, + { + "ce_loss": 0.17891079187393188, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "distill_loss": 0.1077502891421318, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "ref_ce_loss": 0.1313294768333435, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "loss": 0.5507728457450867, + "step": 4590 + }, + { + "ce_loss": 0.2049635499715805, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "distill_loss": 0.10741844028234482, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "ref_ce_loss": 0.192083477973938, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "loss": 0.22651554644107819, + "step": 4590 + }, + { + "ce_loss": 0.0875018909573555, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "distill_loss": 0.07626447826623917, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "ref_ce_loss": 0.06258339434862137, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "loss": 0.5488446354866028, + "step": 4590 + }, + { + "ce_loss": 0.25994938611984253, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "distill_loss": 0.114890918135643, + "epoch": 1.5310206804536357, + "step": 4590 + }, + { + "epoch": 1.5310206804536357, + "ref_ce_loss": 0.17376874387264252, + "step": 4590 + }, + { + "epoch": 1.534356237491661, + "loss": 0.702, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "grad_norm": 5.405256271362305, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "learning_rate": 0.0002798767939554372, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "loss": 0.5550327897071838, + "step": 4600 + }, + { + "ce_loss": 0.22164778411388397, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "distill_loss": 0.10468171536922455, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "ref_ce_loss": 0.14506924152374268, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "loss": 0.38334205746650696, + "step": 4600 + }, + { + "ce_loss": 0.1728275716304779, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "distill_loss": 0.09280980378389359, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "ref_ce_loss": 0.1168849989771843, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "loss": 0.43203508853912354, + "step": 4600 + }, + { + "ce_loss": 0.1975594013929367, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "distill_loss": 0.08630432188510895, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "ref_ce_loss": 0.11872578412294388, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "loss": 0.5808179378509521, + "step": 4600 + }, + { + "ce_loss": 0.3010261654853821, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "distill_loss": 0.10343575477600098, + "epoch": 1.534356237491661, + "step": 4600 + }, + { + "epoch": 1.534356237491661, + "ref_ce_loss": 0.1339024305343628, + "step": 4600 + }, + { + "epoch": 1.5376917945296864, + "loss": 0.5234, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "grad_norm": 2.9711384773254395, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "learning_rate": 0.00027977533168696154, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "loss": 0.45412778854370117, + "step": 4610 + }, + { + "ce_loss": 0.17941448092460632, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "distill_loss": 0.1014786809682846, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "ref_ce_loss": 0.10625439882278442, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "loss": 0.4874230623245239, + "step": 4610 + }, + { + "ce_loss": 0.12424776703119278, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "distill_loss": 0.0906376838684082, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "ref_ce_loss": 0.12060170620679855, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "loss": 0.4538404643535614, + "step": 4610 + }, + { + "ce_loss": 0.16458889842033386, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "distill_loss": 0.09361595660448074, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "ref_ce_loss": 0.13653051853179932, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "loss": 0.681818962097168, + "step": 4610 + }, + { + "ce_loss": 0.1615886092185974, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "distill_loss": 0.1266528218984604, + "epoch": 1.5376917945296864, + "step": 4610 + }, + { + "epoch": 1.5376917945296864, + "ref_ce_loss": 0.12056848406791687, + "step": 4610 + }, + { + "epoch": 1.5410273515677118, + "loss": 0.582, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "grad_norm": 2.548248052597046, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "learning_rate": 0.0002796736327594731, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "loss": 0.5229469537734985, + "step": 4620 + }, + { + "ce_loss": 0.23538225889205933, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "distill_loss": 0.11060227453708649, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "ref_ce_loss": 0.12148841470479965, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "loss": 0.6585526466369629, + "step": 4620 + }, + { + "ce_loss": 0.27497315406799316, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "distill_loss": 0.12979480624198914, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "ref_ce_loss": 0.14562958478927612, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "loss": 0.9131790399551392, + "step": 4620 + }, + { + "ce_loss": 0.2843893766403198, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "distill_loss": 0.11908374726772308, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "ref_ce_loss": 0.1165936067700386, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "loss": 0.46176856756210327, + "step": 4620 + }, + { + "ce_loss": 0.11503535509109497, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "distill_loss": 0.08880730718374252, + "epoch": 1.5410273515677118, + "step": 4620 + }, + { + "epoch": 1.5410273515677118, + "ref_ce_loss": 0.12023654580116272, + "step": 4620 + }, + { + "epoch": 1.544362908605737, + "loss": 0.6127, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "grad_norm": 2.780367612838745, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "learning_rate": 0.00027957169735843066, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "loss": 0.544620931148529, + "step": 4630 + }, + { + "ce_loss": 0.14425063133239746, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "distill_loss": 0.11910367757081985, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "ref_ce_loss": 0.08964873105287552, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "loss": 0.4384223520755768, + "step": 4630 + }, + { + "ce_loss": 0.2096438705921173, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "distill_loss": 0.1158871054649353, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "ref_ce_loss": 0.11236631125211716, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "loss": 0.34739792346954346, + "step": 4630 + }, + { + "ce_loss": 0.15089234709739685, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "distill_loss": 0.11569201201200485, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "ref_ce_loss": 0.07834845036268234, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "loss": 0.6551713943481445, + "step": 4630 + }, + { + "ce_loss": 0.14992360770702362, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "distill_loss": 0.10660592466592789, + "epoch": 1.544362908605737, + "step": 4630 + }, + { + "epoch": 1.544362908605737, + "ref_ce_loss": 0.10331352800130844, + "step": 4630 + }, + { + "epoch": 1.5476984656437625, + "loss": 0.5794, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "grad_norm": 1.8713886737823486, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "learning_rate": 0.00027946952566972397, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "loss": 0.416147381067276, + "step": 4640 + }, + { + "ce_loss": 0.17810320854187012, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "distill_loss": 0.07940616458654404, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "ref_ce_loss": 0.15827806293964386, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "loss": 0.4797060191631317, + "step": 4640 + }, + { + "ce_loss": 0.16103573143482208, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "distill_loss": 0.10082913935184479, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "ref_ce_loss": 0.15501374006271362, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "loss": 0.5062236189842224, + "step": 4640 + }, + { + "ce_loss": 0.2246425896883011, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "distill_loss": 0.11969659477472305, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "ref_ce_loss": 0.12336175888776779, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "loss": 0.3968091905117035, + "step": 4640 + }, + { + "ce_loss": 0.17112435400485992, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "distill_loss": 0.10536085069179535, + "epoch": 1.5476984656437625, + "step": 4640 + }, + { + "epoch": 1.5476984656437625, + "ref_ce_loss": 0.12002520263195038, + "step": 4640 + }, + { + "epoch": 1.5510340226817878, + "loss": 0.5791, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "grad_norm": 4.521932601928711, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "learning_rate": 0.0002793671178796741, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "loss": 0.5670678615570068, + "step": 4650 + }, + { + "ce_loss": 0.2741747796535492, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "distill_loss": 0.10975248366594315, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "ref_ce_loss": 0.12581507861614227, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "loss": 0.31926068663597107, + "step": 4650 + }, + { + "ce_loss": 0.08233984559774399, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "distill_loss": 0.07135706394910812, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "ref_ce_loss": 0.0867433100938797, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "loss": 0.834353506565094, + "step": 4650 + }, + { + "ce_loss": 0.20248062908649445, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "distill_loss": 0.10472960025072098, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "ref_ce_loss": 0.13779492676258087, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "loss": 0.7164924740791321, + "step": 4650 + }, + { + "ce_loss": 0.24499934911727905, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "distill_loss": 0.10008464753627777, + "epoch": 1.5510340226817878, + "step": 4650 + }, + { + "epoch": 1.5510340226817878, + "ref_ce_loss": 0.18830597400665283, + "step": 4650 + }, + { + "epoch": 1.5543695797198132, + "loss": 0.5762, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "grad_norm": 1.9185824394226074, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "learning_rate": 0.0002792644741750324, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "loss": 0.3904956877231598, + "step": 4660 + }, + { + "ce_loss": 0.17337539792060852, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "distill_loss": 0.10445573925971985, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "ref_ce_loss": 0.07155634462833405, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "loss": 0.6196936368942261, + "step": 4660 + }, + { + "ce_loss": 0.14578856527805328, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "distill_loss": 0.09393522143363953, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "ref_ce_loss": 0.10801822692155838, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "loss": 0.9515413045883179, + "step": 4660 + }, + { + "ce_loss": 0.2024691253900528, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "distill_loss": 0.10020405799150467, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "ref_ce_loss": 0.1126125305891037, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "loss": 0.7406129240989685, + "step": 4660 + }, + { + "ce_loss": 0.3020225763320923, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "distill_loss": 0.10811947286128998, + "epoch": 1.5543695797198132, + "step": 4660 + }, + { + "epoch": 1.5543695797198132, + "ref_ce_loss": 0.19355078041553497, + "step": 4660 + }, + { + "epoch": 1.5577051367578385, + "loss": 0.6206, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "grad_norm": 2.085416555404663, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "learning_rate": 0.00027916159474298044, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "loss": 0.5231167078018188, + "step": 4670 + }, + { + "ce_loss": 0.14092274010181427, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "distill_loss": 0.0953192263841629, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "ref_ce_loss": 0.13460183143615723, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "loss": 0.6252128481864929, + "step": 4670 + }, + { + "ce_loss": 0.15868394076824188, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "distill_loss": 0.0933477133512497, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "ref_ce_loss": 0.15329962968826294, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "loss": 0.6975128650665283, + "step": 4670 + }, + { + "ce_loss": 0.21315570175647736, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "distill_loss": 0.10002562403678894, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "ref_ce_loss": 0.1326710730791092, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "loss": 0.3720162510871887, + "step": 4670 + }, + { + "ce_loss": 0.15149645507335663, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "distill_loss": 0.08944686502218246, + "epoch": 1.5577051367578385, + "step": 4670 + }, + { + "epoch": 1.5577051367578385, + "ref_ce_loss": 0.12873674929141998, + "step": 4670 + }, + { + "epoch": 1.5610406937958639, + "loss": 0.59, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "grad_norm": 2.7784948348999023, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "learning_rate": 0.0002790584797711298, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "loss": 0.7926986813545227, + "step": 4680 + }, + { + "ce_loss": 0.2878677248954773, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "distill_loss": 0.16262328624725342, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "ref_ce_loss": 0.14896634221076965, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "loss": 0.7145441770553589, + "step": 4680 + }, + { + "ce_loss": 0.21132363379001617, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "distill_loss": 0.11646532267332077, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "ref_ce_loss": 0.15072284638881683, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "loss": 0.5834494233131409, + "step": 4680 + }, + { + "ce_loss": 0.22991342842578888, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "distill_loss": 0.1131071075797081, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "ref_ce_loss": 0.12597393989562988, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "loss": 0.4927375316619873, + "step": 4680 + }, + { + "ce_loss": 0.17881889641284943, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "distill_loss": 0.10333593934774399, + "epoch": 1.5610406937958639, + "step": 4680 + }, + { + "epoch": 1.5610406937958639, + "ref_ce_loss": 0.14724135398864746, + "step": 4680 + }, + { + "epoch": 1.5643762508338894, + "loss": 0.6419, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "grad_norm": 3.5349230766296387, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "learning_rate": 0.00027895512944752144, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "loss": 0.8685083389282227, + "step": 4690 + }, + { + "ce_loss": 0.23852185904979706, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "distill_loss": 0.18311463296413422, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "ref_ce_loss": 0.18137918412685394, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "loss": 0.5344275832176208, + "step": 4690 + }, + { + "ce_loss": 0.20123222470283508, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "distill_loss": 0.13716256618499756, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "ref_ce_loss": 0.14230753481388092, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "loss": 0.738693118095398, + "step": 4690 + }, + { + "ce_loss": 0.28084349632263184, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "distill_loss": 0.15618476271629333, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "ref_ce_loss": 0.17222802340984344, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "loss": 0.5586490035057068, + "step": 4690 + }, + { + "ce_loss": 0.21215829253196716, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "distill_loss": 0.13161195814609528, + "epoch": 1.5643762508338894, + "step": 4690 + }, + { + "epoch": 1.5643762508338894, + "ref_ce_loss": 0.18186885118484497, + "step": 4690 + }, + { + "epoch": 1.5677118078719148, + "loss": 0.6155, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "grad_norm": 2.473024845123291, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "learning_rate": 0.0002788515439606256, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "loss": 0.37272197008132935, + "step": 4700 + }, + { + "ce_loss": 0.1430395096540451, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "distill_loss": 0.1030561700463295, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "ref_ce_loss": 0.09838125109672546, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "loss": 0.6110117435455322, + "step": 4700 + }, + { + "ce_loss": 0.23183700442314148, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "distill_loss": 0.13275600969791412, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "ref_ce_loss": 0.16894279420375824, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "loss": 0.42745649814605713, + "step": 4700 + }, + { + "ce_loss": 0.16604764759540558, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "distill_loss": 0.09421001374721527, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "ref_ce_loss": 0.10031646490097046, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "loss": 0.6442577838897705, + "step": 4700 + }, + { + "ce_loss": 0.2546035349369049, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "distill_loss": 0.14264395833015442, + "epoch": 1.5677118078719148, + "step": 4700 + }, + { + "epoch": 1.5677118078719148, + "ref_ce_loss": 0.17145292460918427, + "step": 4700 + }, + { + "epoch": 1.5710473649099401, + "loss": 0.6352, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "grad_norm": 2.054325819015503, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "learning_rate": 0.0002787477234993414, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "loss": 0.8072826862335205, + "step": 4710 + }, + { + "ce_loss": 0.2330678254365921, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "distill_loss": 0.11896157264709473, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "ref_ce_loss": 0.15135934948921204, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "loss": 0.5528466105461121, + "step": 4710 + }, + { + "ce_loss": 0.19632616639137268, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "distill_loss": 0.139164000749588, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "ref_ce_loss": 0.09339972585439682, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "loss": 0.8057548403739929, + "step": 4710 + }, + { + "ce_loss": 0.25221431255340576, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "distill_loss": 0.12725681066513062, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "ref_ce_loss": 0.14530764520168304, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "loss": 0.5259864926338196, + "step": 4710 + }, + { + "ce_loss": 0.18317881226539612, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "distill_loss": 0.11473749577999115, + "epoch": 1.5710473649099401, + "step": 4710 + }, + { + "epoch": 1.5710473649099401, + "ref_ce_loss": 0.14014258980751038, + "step": 4710 + }, + { + "epoch": 1.5743829219479655, + "loss": 0.5894, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "grad_norm": 2.167114019393921, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "learning_rate": 0.00027864366825299636, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "loss": 0.48211607336997986, + "step": 4720 + }, + { + "ce_loss": 0.19091051816940308, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "distill_loss": 0.1282716691493988, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "ref_ce_loss": 0.09083818644285202, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "loss": 0.47632259130477905, + "step": 4720 + }, + { + "ce_loss": 0.23266349732875824, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "distill_loss": 0.10076127201318741, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "ref_ce_loss": 0.1049838587641716, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "loss": 0.542456865310669, + "step": 4720 + }, + { + "ce_loss": 0.19646567106246948, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "distill_loss": 0.13279792666435242, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "ref_ce_loss": 0.12299693375825882, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "loss": 0.44261011481285095, + "step": 4720 + }, + { + "ce_loss": 0.20179417729377747, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "distill_loss": 0.09362678974866867, + "epoch": 1.5743829219479655, + "step": 4720 + }, + { + "epoch": 1.5743829219479655, + "ref_ce_loss": 0.10843309015035629, + "step": 4720 + }, + { + "epoch": 1.5777184789859908, + "loss": 0.6023, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "grad_norm": 1.6021579504013062, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "learning_rate": 0.0002785393784113462, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "loss": 0.7584335803985596, + "step": 4730 + }, + { + "ce_loss": 0.29199421405792236, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "distill_loss": 0.12377098947763443, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "ref_ce_loss": 0.15207377076148987, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "loss": 0.4357783794403076, + "step": 4730 + }, + { + "ce_loss": 0.15077053010463715, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "distill_loss": 0.08500176668167114, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "ref_ce_loss": 0.11844529956579208, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "loss": 0.5543081164360046, + "step": 4730 + }, + { + "ce_loss": 0.2360016107559204, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "distill_loss": 0.11848665773868561, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "ref_ce_loss": 0.15166281163692474, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "loss": 0.992057204246521, + "step": 4730 + }, + { + "ce_loss": 0.28633835911750793, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "distill_loss": 0.11764726042747498, + "epoch": 1.5777184789859908, + "step": 4730 + }, + { + "epoch": 1.5777184789859908, + "ref_ce_loss": 0.1075211763381958, + "step": 4730 + }, + { + "epoch": 1.5810540360240162, + "loss": 0.5752, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "grad_norm": 2.260157346725464, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "learning_rate": 0.00027843485416457445, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "loss": 0.42545413970947266, + "step": 4740 + }, + { + "ce_loss": 0.1473778933286667, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "distill_loss": 0.09845662862062454, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "ref_ce_loss": 0.09697523713111877, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "loss": 0.7466808557510376, + "step": 4740 + }, + { + "ce_loss": 0.15043483674526215, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "distill_loss": 0.15371906757354736, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "ref_ce_loss": 0.11501208692789078, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "loss": 0.5339465141296387, + "step": 4740 + }, + { + "ce_loss": 0.22342433035373688, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "distill_loss": 0.12968042492866516, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "ref_ce_loss": 0.12109992653131485, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "loss": 0.4160770773887634, + "step": 4740 + }, + { + "ce_loss": 0.12246851623058319, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "distill_loss": 0.10528349876403809, + "epoch": 1.5810540360240162, + "step": 4740 + }, + { + "epoch": 1.5810540360240162, + "ref_ce_loss": 0.13350163400173187, + "step": 4740 + }, + { + "epoch": 1.5843895930620415, + "loss": 0.5754, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "grad_norm": 2.2725913524627686, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "learning_rate": 0.0002783300957032921, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "loss": 0.8523422479629517, + "step": 4750 + }, + { + "ce_loss": 0.20142604410648346, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "distill_loss": 0.12058450281620026, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "ref_ce_loss": 0.10920006781816483, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "loss": 0.8453580737113953, + "step": 4750 + }, + { + "ce_loss": 0.3348260819911957, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "distill_loss": 0.16452373564243317, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "ref_ce_loss": 0.17683695256710052, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "loss": 0.7837134003639221, + "step": 4750 + }, + { + "ce_loss": 0.2850218415260315, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "distill_loss": 0.1574116200208664, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "ref_ce_loss": 0.15330904722213745, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "loss": 1.0082786083221436, + "step": 4750 + }, + { + "ce_loss": 0.22477564215660095, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "distill_loss": 0.12516120076179504, + "epoch": 1.5843895930620415, + "step": 4750 + }, + { + "epoch": 1.5843895930620415, + "ref_ce_loss": 0.17074769735336304, + "step": 4750 + }, + { + "epoch": 1.5877251501000669, + "loss": 0.711, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "grad_norm": 3.120191812515259, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "learning_rate": 0.00027822510321853734, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "loss": 0.5238044261932373, + "step": 4760 + }, + { + "ce_loss": 0.22598464787006378, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "distill_loss": 0.12697666883468628, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "ref_ce_loss": 0.14048749208450317, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "loss": 0.6119301915168762, + "step": 4760 + }, + { + "ce_loss": 0.2498970329761505, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "distill_loss": 0.18579566478729248, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "ref_ce_loss": 0.1740613728761673, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "loss": 0.7766237258911133, + "step": 4760 + }, + { + "ce_loss": 0.18085479736328125, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "distill_loss": 0.15683233737945557, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "ref_ce_loss": 0.18002113699913025, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "loss": 0.4009416103363037, + "step": 4760 + }, + { + "ce_loss": 0.13139718770980835, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "distill_loss": 0.11593674123287201, + "epoch": 1.5877251501000669, + "step": 4760 + }, + { + "epoch": 1.5877251501000669, + "ref_ce_loss": 0.08506946265697479, + "step": 4760 + }, + { + "epoch": 1.5910607071380922, + "loss": 0.598, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "grad_norm": 1.9799612760543823, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "learning_rate": 0.000278119876901775, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "loss": 0.6736071705818176, + "step": 4770 + }, + { + "ce_loss": 0.30645042657852173, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "distill_loss": 0.13452163338661194, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "ref_ce_loss": 0.17019902169704437, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "loss": 1.46865975856781, + "step": 4770 + }, + { + "ce_loss": 0.28805285692214966, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "distill_loss": 0.1221141368150711, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "ref_ce_loss": 0.18184491991996765, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "loss": 0.5761913061141968, + "step": 4770 + }, + { + "ce_loss": 0.13550378382205963, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "distill_loss": 0.10447970777750015, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "ref_ce_loss": 0.09147114306688309, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "loss": 1.2852704524993896, + "step": 4770 + }, + { + "ce_loss": 0.22348161041736603, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "distill_loss": 0.11755059659481049, + "epoch": 1.5910607071380922, + "step": 4770 + }, + { + "epoch": 1.5910607071380922, + "ref_ce_loss": 0.1853521466255188, + "step": 4770 + }, + { + "epoch": 1.5943962641761176, + "loss": 0.5862, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "grad_norm": 2.4209163188934326, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "learning_rate": 0.0002780144169448963, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "loss": 1.2094604969024658, + "step": 4780 + }, + { + "ce_loss": 0.15398405492305756, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "distill_loss": 0.09844505041837692, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "ref_ce_loss": 0.12485992163419724, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "loss": 0.3832671046257019, + "step": 4780 + }, + { + "ce_loss": 0.12382929027080536, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "distill_loss": 0.0875585600733757, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "ref_ce_loss": 0.08526071906089783, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "loss": 0.38048553466796875, + "step": 4780 + }, + { + "ce_loss": 0.15026162564754486, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "distill_loss": 0.11404670774936676, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "ref_ce_loss": 0.11563827842473984, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "loss": 0.6271405220031738, + "step": 4780 + }, + { + "ce_loss": 0.19329474866390228, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "distill_loss": 0.09335696697235107, + "epoch": 1.5943962641761176, + "step": 4780 + }, + { + "epoch": 1.5943962641761176, + "ref_ce_loss": 0.13915462791919708, + "step": 4780 + }, + { + "epoch": 1.597731821214143, + "loss": 0.6104, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "grad_norm": 2.573589324951172, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "learning_rate": 0.0002779087235402187, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "loss": 0.740349531173706, + "step": 4790 + }, + { + "ce_loss": 0.14136841893196106, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "distill_loss": 0.08817031234502792, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "ref_ce_loss": 0.1261877566576004, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "loss": 0.4377024173736572, + "step": 4790 + }, + { + "ce_loss": 0.15925750136375427, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "distill_loss": 0.08435290306806564, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "ref_ce_loss": 0.09518688917160034, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "loss": 1.1542946100234985, + "step": 4790 + }, + { + "ce_loss": 0.2103382796049118, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "distill_loss": 0.08120667934417725, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "ref_ce_loss": 0.17455676198005676, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "loss": 0.5343722105026245, + "step": 4790 + }, + { + "ce_loss": 0.18151548504829407, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "distill_loss": 0.07225097715854645, + "epoch": 1.597731821214143, + "step": 4790 + }, + { + "epoch": 1.597731821214143, + "ref_ce_loss": 0.14874359965324402, + "step": 4790 + }, + { + "epoch": 1.6010673782521683, + "loss": 0.6157, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "grad_norm": 2.3918330669403076, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "learning_rate": 0.00027780279688048516, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "loss": 0.426487535238266, + "step": 4800 + }, + { + "ce_loss": 0.15047642588615417, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "distill_loss": 0.10461893677711487, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "ref_ce_loss": 0.17090778052806854, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "loss": 0.5501457452774048, + "step": 4800 + }, + { + "ce_loss": 0.216351717710495, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "distill_loss": 0.12492936104536057, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "ref_ce_loss": 0.15076109766960144, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "loss": 0.43778330087661743, + "step": 4800 + }, + { + "ce_loss": 0.20027485489845276, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "distill_loss": 0.10449983924627304, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "ref_ce_loss": 0.13276301324367523, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "loss": 0.3523600697517395, + "step": 4800 + }, + { + "ce_loss": 0.16868212819099426, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "distill_loss": 0.08839337527751923, + "epoch": 1.6010673782521683, + "step": 4800 + }, + { + "epoch": 1.6010673782521683, + "ref_ce_loss": 0.0947473868727684, + "step": 4800 + }, + { + "epoch": 1.6044029352901936, + "loss": 0.5668, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "grad_norm": 1.779632806777954, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "learning_rate": 0.00027769663715886426, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "loss": 0.5785982608795166, + "step": 4810 + }, + { + "ce_loss": 0.20843467116355896, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "distill_loss": 0.08618040382862091, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "ref_ce_loss": 0.13870251178741455, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "loss": 0.4519733190536499, + "step": 4810 + }, + { + "ce_loss": 0.09943678975105286, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "distill_loss": 0.08621320873498917, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "ref_ce_loss": 0.12685640156269073, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "loss": 0.4492816627025604, + "step": 4810 + }, + { + "ce_loss": 0.19865265488624573, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "distill_loss": 0.11306588351726532, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "ref_ce_loss": 0.13746190071105957, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "loss": 0.8414527177810669, + "step": 4810 + }, + { + "ce_loss": 0.29073143005371094, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "distill_loss": 0.11862733960151672, + "epoch": 1.6044029352901936, + "step": 4810 + }, + { + "epoch": 1.6044029352901936, + "ref_ce_loss": 0.1853315234184265, + "step": 4810 + }, + { + "epoch": 1.607738492328219, + "loss": 0.6134, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "grad_norm": 2.555032968521118, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "learning_rate": 0.0002775902445689494, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "loss": 0.45475345849990845, + "step": 4820 + }, + { + "ce_loss": 0.21073417365550995, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "distill_loss": 0.10298866778612137, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "ref_ce_loss": 0.0919463261961937, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "loss": 0.7756646275520325, + "step": 4820 + }, + { + "ce_loss": 0.3087059557437897, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "distill_loss": 0.12526912987232208, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "ref_ce_loss": 0.13909755647182465, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "loss": 0.48539334535598755, + "step": 4820 + }, + { + "ce_loss": 0.17870061099529266, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "distill_loss": 0.0823015347123146, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "ref_ce_loss": 0.11708899587392807, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "loss": 0.45261070132255554, + "step": 4820 + }, + { + "ce_loss": 0.21282659471035004, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "distill_loss": 0.10187379270792007, + "epoch": 1.607738492328219, + "step": 4820 + }, + { + "epoch": 1.607738492328219, + "ref_ce_loss": 0.08549916744232178, + "step": 4820 + }, + { + "epoch": 1.6110740493662443, + "loss": 0.641, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "grad_norm": 4.425067901611328, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "learning_rate": 0.0002774836193047587, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "loss": 0.4936881363391876, + "step": 4830 + }, + { + "ce_loss": 0.1533152163028717, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "distill_loss": 0.19593192636966705, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "ref_ce_loss": 0.0907311663031578, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "loss": 0.6278508305549622, + "step": 4830 + }, + { + "ce_loss": 0.17450125515460968, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "distill_loss": 0.21686002612113953, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "ref_ce_loss": 0.15096011757850647, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "loss": 0.5692883133888245, + "step": 4830 + }, + { + "ce_loss": 0.1969756931066513, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "distill_loss": 0.1700528860092163, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "ref_ce_loss": 0.152689591050148, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "loss": 0.8832573294639587, + "step": 4830 + }, + { + "ce_loss": 0.19293345510959625, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "distill_loss": 0.224935844540596, + "epoch": 1.6110740493662443, + "step": 4830 + }, + { + "epoch": 1.6110740493662443, + "ref_ce_loss": 0.17869606614112854, + "step": 4830 + }, + { + "epoch": 1.6144096064042697, + "loss": 0.7192, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "grad_norm": 5.211795806884766, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "learning_rate": 0.00027737676156073453, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "loss": 0.7288017272949219, + "step": 4840 + }, + { + "ce_loss": 0.21490071713924408, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "distill_loss": 0.2942824065685272, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "ref_ce_loss": 0.08734626322984695, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "loss": 0.6859686970710754, + "step": 4840 + }, + { + "ce_loss": 0.20365281403064728, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "distill_loss": 0.30579593777656555, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "ref_ce_loss": 0.13223841786384583, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "loss": 1.122478723526001, + "step": 4840 + }, + { + "ce_loss": 0.13811272382736206, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "distill_loss": 0.24075543880462646, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "ref_ce_loss": 0.12147624790668488, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "loss": 0.9976313710212708, + "step": 4840 + }, + { + "ce_loss": 0.39122068881988525, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "distill_loss": 0.2513085603713989, + "epoch": 1.6144096064042697, + "step": 4840 + }, + { + "epoch": 1.6144096064042697, + "ref_ce_loss": 0.2339351624250412, + "step": 4840 + }, + { + "epoch": 1.617745163442295, + "loss": 0.6723, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "grad_norm": 2.4428341388702393, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "learning_rate": 0.00027726967153174337, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "loss": 0.8324744701385498, + "step": 4850 + }, + { + "ce_loss": 0.2209848016500473, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "distill_loss": 0.1588110327720642, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "ref_ce_loss": 0.13250286877155304, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "loss": 0.43706193566322327, + "step": 4850 + }, + { + "ce_loss": 0.141639843583107, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "distill_loss": 0.15065184235572815, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "ref_ce_loss": 0.09860837459564209, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "loss": 0.49818480014801025, + "step": 4850 + }, + { + "ce_loss": 0.08093893527984619, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "distill_loss": 0.1619725227355957, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "ref_ce_loss": 0.09855761379003525, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "loss": 0.9106850624084473, + "step": 4850 + }, + { + "ce_loss": 0.29924336075782776, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "distill_loss": 0.21561862528324127, + "epoch": 1.617745163442295, + "step": 4850 + }, + { + "epoch": 1.617745163442295, + "ref_ce_loss": 0.13731172680854797, + "step": 4850 + }, + { + "epoch": 1.6210807204803204, + "loss": 0.622, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "grad_norm": 2.667039155960083, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "learning_rate": 0.00027716234941307504, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "loss": 0.4096541404724121, + "step": 4860 + }, + { + "ce_loss": 0.12734420597553253, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "distill_loss": 0.10120591521263123, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "ref_ce_loss": 0.11490931361913681, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "loss": 0.4143024682998657, + "step": 4860 + }, + { + "ce_loss": 0.14841949939727783, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "distill_loss": 0.10297094285488129, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "ref_ce_loss": 0.11480960249900818, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "loss": 0.7108901739120483, + "step": 4860 + }, + { + "ce_loss": 0.2291679084300995, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "distill_loss": 0.1301821917295456, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "ref_ce_loss": 0.1481446474790573, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "loss": 0.44128403067588806, + "step": 4860 + }, + { + "ce_loss": 0.1561107039451599, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "distill_loss": 0.10009510815143585, + "epoch": 1.6210807204803204, + "step": 4860 + }, + { + "epoch": 1.6210807204803204, + "ref_ce_loss": 0.11934979259967804, + "step": 4860 + }, + { + "epoch": 1.6244162775183457, + "loss": 0.5846, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "grad_norm": 2.787006378173828, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "learning_rate": 0.00027705479540044293, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "loss": 0.8736571669578552, + "step": 4870 + }, + { + "ce_loss": 0.12208632379770279, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "distill_loss": 0.14207608997821808, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "ref_ce_loss": 0.11297786980867386, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "loss": 0.6192541718482971, + "step": 4870 + }, + { + "ce_loss": 0.27191561460494995, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "distill_loss": 0.16424386203289032, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "ref_ce_loss": 0.18219006061553955, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "loss": 0.5637862086296082, + "step": 4870 + }, + { + "ce_loss": 0.20224043726921082, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "distill_loss": 0.14408601820468903, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "ref_ce_loss": 0.1527886986732483, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "loss": 0.5461189150810242, + "step": 4870 + }, + { + "ce_loss": 0.14812813699245453, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "distill_loss": 0.14465124905109406, + "epoch": 1.6244162775183457, + "step": 4870 + }, + { + "epoch": 1.6244162775183457, + "ref_ce_loss": 0.17705132067203522, + "step": 4870 + }, + { + "epoch": 1.627751834556371, + "loss": 0.5969, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "grad_norm": 2.4038162231445312, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "learning_rate": 0.00027694700968998296, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "loss": 0.5871032476425171, + "step": 4880 + }, + { + "ce_loss": 0.220157653093338, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "distill_loss": 0.11889912933111191, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "ref_ce_loss": 0.16518516838550568, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "loss": 0.6563812494277954, + "step": 4880 + }, + { + "ce_loss": 0.27671051025390625, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "distill_loss": 0.12580226361751556, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "ref_ce_loss": 0.19856387376785278, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "loss": 0.4133327603340149, + "step": 4880 + }, + { + "ce_loss": 0.18373562395572662, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "distill_loss": 0.10071388632059097, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "ref_ce_loss": 0.09259993582963943, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "loss": 0.5829482078552246, + "step": 4880 + }, + { + "ce_loss": 0.17557404935359955, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "distill_loss": 0.09654910862445831, + "epoch": 1.627751834556371, + "step": 4880 + }, + { + "epoch": 1.627751834556371, + "ref_ce_loss": 0.17849195003509521, + "step": 4880 + }, + { + "epoch": 1.6310873915943964, + "loss": 0.584, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "grad_norm": 2.1157753467559814, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "learning_rate": 0.00027683899247825383, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "loss": 0.6509414911270142, + "step": 4890 + }, + { + "ce_loss": 0.16850188374519348, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "distill_loss": 0.07165582478046417, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "ref_ce_loss": 0.11415962874889374, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "loss": 1.0927846431732178, + "step": 4890 + }, + { + "ce_loss": 0.2822512984275818, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "distill_loss": 0.10931877791881561, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "ref_ce_loss": 0.16365401446819305, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "loss": 0.5998976230621338, + "step": 4890 + }, + { + "ce_loss": 0.19696788489818573, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "distill_loss": 0.09859666973352432, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "ref_ce_loss": 0.1334376484155655, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "loss": 0.7270228862762451, + "step": 4890 + }, + { + "ce_loss": 0.2925046682357788, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "distill_loss": 0.12335610389709473, + "epoch": 1.6310873915943964, + "step": 4890 + }, + { + "epoch": 1.6310873915943964, + "ref_ce_loss": 0.16185958683490753, + "step": 4890 + }, + { + "epoch": 1.6344229486324218, + "loss": 0.6318, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "grad_norm": 2.7217421531677246, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "learning_rate": 0.00027673074396223637, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "loss": 0.49891188740730286, + "step": 4900 + }, + { + "ce_loss": 0.1908903568983078, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "distill_loss": 0.1197698637843132, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "ref_ce_loss": 0.14133276045322418, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "loss": 0.6270771622657776, + "step": 4900 + }, + { + "ce_loss": 0.17307709157466888, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "distill_loss": 0.08771342039108276, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "ref_ce_loss": 0.09575901180505753, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "loss": 0.5917713046073914, + "step": 4900 + }, + { + "ce_loss": 0.18899348378181458, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "distill_loss": 0.11513810604810715, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "ref_ce_loss": 0.14573843777179718, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "loss": 0.5202388763427734, + "step": 4900 + }, + { + "ce_loss": 0.20376692712306976, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "distill_loss": 0.09992238134145737, + "epoch": 1.6344229486324218, + "step": 4900 + }, + { + "epoch": 1.6344229486324218, + "ref_ce_loss": 0.1302812397480011, + "step": 4900 + }, + { + "epoch": 1.6377585056704471, + "loss": 0.6124, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "grad_norm": 3.521989345550537, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "learning_rate": 0.00027662226433933305, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "loss": 0.4377168118953705, + "step": 4910 + }, + { + "ce_loss": 0.15247930586338043, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "distill_loss": 0.09452302753925323, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "ref_ce_loss": 0.1294000744819641, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "loss": 0.3409244418144226, + "step": 4910 + }, + { + "ce_loss": 0.1711265742778778, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "distill_loss": 0.10409481078386307, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "ref_ce_loss": 0.06564094126224518, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "loss": 0.4959399998188019, + "step": 4910 + }, + { + "ce_loss": 0.1535956710577011, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "distill_loss": 0.09488362818956375, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "ref_ce_loss": 0.09364673495292664, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "loss": 0.5725511908531189, + "step": 4910 + }, + { + "ce_loss": 0.26551344990730286, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "distill_loss": 0.13034331798553467, + "epoch": 1.6377585056704471, + "step": 4910 + }, + { + "epoch": 1.6377585056704471, + "ref_ce_loss": 0.11410856246948242, + "step": 4910 + }, + { + "epoch": 1.6410940627084725, + "loss": 0.6464, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "grad_norm": 2.946782112121582, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "learning_rate": 0.000276513553807368, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "loss": 0.6583297252655029, + "step": 4920 + }, + { + "ce_loss": 0.3206038475036621, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "distill_loss": 0.1186511218547821, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "ref_ce_loss": 0.15720915794372559, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "loss": 0.4410336911678314, + "step": 4920 + }, + { + "ce_loss": 0.13522271811962128, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "distill_loss": 0.09478119015693665, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "ref_ce_loss": 0.12998971343040466, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "loss": 0.4015612304210663, + "step": 4920 + }, + { + "ce_loss": 0.18265585601329803, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "distill_loss": 0.11469544470310211, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "ref_ce_loss": 0.10402646660804749, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "loss": 0.6775842308998108, + "step": 4920 + }, + { + "ce_loss": 0.2640410363674164, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "distill_loss": 0.14958937466144562, + "epoch": 1.6410940627084725, + "step": 4920 + }, + { + "epoch": 1.6410940627084725, + "ref_ce_loss": 0.1849392205476761, + "step": 4920 + }, + { + "epoch": 1.6444296197464978, + "loss": 0.601, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "grad_norm": 2.743461847305298, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "learning_rate": 0.0002764046125645864, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "loss": 0.39398637413978577, + "step": 4930 + }, + { + "ce_loss": 0.11836259067058563, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "distill_loss": 0.07528175413608551, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "ref_ce_loss": 0.09293336421251297, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "loss": 0.5988407135009766, + "step": 4930 + }, + { + "ce_loss": 0.25346532464027405, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "distill_loss": 0.11336065828800201, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "ref_ce_loss": 0.1648983359336853, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "loss": 0.4127940237522125, + "step": 4930 + }, + { + "ce_loss": 0.12222033739089966, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "distill_loss": 0.11669386178255081, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "ref_ce_loss": 0.10548103600740433, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "loss": 1.0301282405853271, + "step": 4930 + }, + { + "ce_loss": 0.2393624484539032, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "distill_loss": 0.1533287614583969, + "epoch": 1.6444296197464978, + "step": 4930 + }, + { + "epoch": 1.6444296197464978, + "ref_ce_loss": 0.15323209762573242, + "step": 4930 + }, + { + "epoch": 1.6477651767845232, + "loss": 0.6081, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "grad_norm": 2.3398468494415283, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "learning_rate": 0.00027629544080965394, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "loss": 0.41326504945755005, + "step": 4940 + }, + { + "ce_loss": 0.15715086460113525, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "distill_loss": 0.12461084872484207, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "ref_ce_loss": 0.08338805288076401, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "loss": 0.8529278039932251, + "step": 4940 + }, + { + "ce_loss": 0.244953915476799, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "distill_loss": 0.14145401120185852, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "ref_ce_loss": 0.210275799036026, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "loss": 0.38149306178092957, + "step": 4940 + }, + { + "ce_loss": 0.11619073152542114, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "distill_loss": 0.1336188167333603, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "ref_ce_loss": 0.13149616122245789, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "loss": 0.6542447209358215, + "step": 4940 + }, + { + "ce_loss": 0.27234119176864624, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "distill_loss": 0.14006751775741577, + "epoch": 1.6477651767845232, + "step": 4940 + }, + { + "epoch": 1.6477651767845232, + "ref_ce_loss": 0.17293211817741394, + "step": 4940 + }, + { + "epoch": 1.6511007338225485, + "loss": 0.5851, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "grad_norm": 2.6936793327331543, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "learning_rate": 0.000276186038741657, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "loss": 0.5390084981918335, + "step": 4950 + }, + { + "ce_loss": 0.1600649505853653, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "distill_loss": 0.2050093561410904, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "ref_ce_loss": 0.09268899261951447, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "loss": 0.5070465803146362, + "step": 4950 + }, + { + "ce_loss": 0.183873251080513, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "distill_loss": 0.13934825360774994, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "ref_ce_loss": 0.18370971083641052, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "loss": 0.7284078001976013, + "step": 4950 + }, + { + "ce_loss": 0.23376776278018951, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "distill_loss": 0.18618497252464294, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "ref_ce_loss": 0.13847507536411285, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "loss": 0.5451200008392334, + "step": 4950 + }, + { + "ce_loss": 0.18877169489860535, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "distill_loss": 0.16713643074035645, + "epoch": 1.6511007338225485, + "step": 4950 + }, + { + "epoch": 1.6511007338225485, + "ref_ce_loss": 0.18870173394680023, + "step": 4950 + }, + { + "epoch": 1.6544362908605739, + "loss": 0.6234, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "grad_norm": 2.0953621864318848, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "learning_rate": 0.0002760764065601017, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "loss": 0.9226181507110596, + "step": 4960 + }, + { + "ce_loss": 0.2521872818470001, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "distill_loss": 0.19906756281852722, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "ref_ce_loss": 0.2884136736392975, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "loss": 0.6516121625900269, + "step": 4960 + }, + { + "ce_loss": 0.17211009562015533, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "distill_loss": 0.17549239099025726, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "ref_ce_loss": 0.1282089799642563, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "loss": 0.5280035138130188, + "step": 4960 + }, + { + "ce_loss": 0.16100002825260162, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "distill_loss": 0.1495133936405182, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "ref_ce_loss": 0.15446114540100098, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "loss": 1.0270130634307861, + "step": 4960 + }, + { + "ce_loss": 0.3464062809944153, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "distill_loss": 0.13512006402015686, + "epoch": 1.6544362908605739, + "step": 4960 + }, + { + "epoch": 1.6544362908605739, + "ref_ce_loss": 0.23012858629226685, + "step": 4960 + }, + { + "epoch": 1.6577718478985992, + "loss": 0.6433, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "grad_norm": 2.4353840351104736, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "learning_rate": 0.0002759665444649139, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "loss": 0.617945671081543, + "step": 4970 + }, + { + "ce_loss": 0.2723182141780853, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "distill_loss": 0.19296707212924957, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "ref_ce_loss": 0.15225380659103394, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "loss": 0.6833885312080383, + "step": 4970 + }, + { + "ce_loss": 0.2776934504508972, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "distill_loss": 0.23715607821941376, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "ref_ce_loss": 0.14985275268554688, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "loss": 0.6385636329650879, + "step": 4970 + }, + { + "ce_loss": 0.17333070933818817, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "distill_loss": 0.21506594121456146, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "ref_ce_loss": 0.09677013009786606, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "loss": 0.7047004699707031, + "step": 4970 + }, + { + "ce_loss": 0.20013362169265747, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "distill_loss": 0.16035297513008118, + "epoch": 1.6577718478985992, + "step": 4970 + }, + { + "epoch": 1.6577718478985992, + "ref_ce_loss": 0.10856600105762482, + "step": 4970 + }, + { + "epoch": 1.6611074049366246, + "loss": 0.6098, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "grad_norm": 1.8887118101119995, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "learning_rate": 0.00027585645265643875, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "loss": 0.46489736437797546, + "step": 4980 + }, + { + "ce_loss": 0.16724559664726257, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "distill_loss": 0.1293192207813263, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "ref_ce_loss": 0.07118848711252213, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "loss": 0.5294301509857178, + "step": 4980 + }, + { + "ce_loss": 0.21990546584129333, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "distill_loss": 0.136548712849617, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "ref_ce_loss": 0.11981360614299774, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "loss": 0.55299973487854, + "step": 4980 + }, + { + "ce_loss": 0.25989824533462524, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "distill_loss": 0.11929081380367279, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "ref_ce_loss": 0.1736879050731659, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "loss": 0.4317464232444763, + "step": 4980 + }, + { + "ce_loss": 0.14439506828784943, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "distill_loss": 0.1170135885477066, + "epoch": 1.6611074049366246, + "step": 4980 + }, + { + "epoch": 1.6611074049366246, + "ref_ce_loss": 0.11630581319332123, + "step": 4980 + }, + { + "epoch": 1.66444296197465, + "loss": 0.5748, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "grad_norm": 3.17610502243042, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "learning_rate": 0.0002757461313354403, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "loss": 0.6473589539527893, + "step": 4990 + }, + { + "ce_loss": 0.2640224099159241, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "distill_loss": 0.14431437849998474, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "ref_ce_loss": 0.17252643406391144, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "loss": 0.5207508206367493, + "step": 4990 + }, + { + "ce_loss": 0.1483282446861267, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "distill_loss": 0.18423455953598022, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "ref_ce_loss": 0.11322248727083206, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "loss": 0.5628471374511719, + "step": 4990 + }, + { + "ce_loss": 0.16010361909866333, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "distill_loss": 0.16277122497558594, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "ref_ce_loss": 0.08963080495595932, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "loss": 0.3767557740211487, + "step": 4990 + }, + { + "ce_loss": 0.10535683482885361, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "distill_loss": 0.13673093914985657, + "epoch": 1.66444296197465, + "step": 4990 + }, + { + "epoch": 1.66444296197465, + "ref_ce_loss": 0.0836324393749237, + "step": 4990 + }, + { + "epoch": 1.6677785190126753, + "loss": 0.6322, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "grad_norm": 1.958633303642273, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "learning_rate": 0.00027563558070310104, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "loss": 0.41848665475845337, + "step": 5000 + }, + { + "ce_loss": 0.10958019644021988, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "distill_loss": 0.13504420220851898, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "ref_ce_loss": 0.07605572789907455, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "loss": 0.6914088726043701, + "step": 5000 + }, + { + "ce_loss": 0.2166513353586197, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "distill_loss": 0.13223090767860413, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "ref_ce_loss": 0.2379608452320099, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "loss": 0.5647359490394592, + "step": 5000 + }, + { + "ce_loss": 0.23841896653175354, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "distill_loss": 0.13560470938682556, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "ref_ce_loss": 0.1557345688343048, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "loss": 0.4025331139564514, + "step": 5000 + }, + { + "ce_loss": 0.14620532095432281, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "distill_loss": 0.10365366190671921, + "epoch": 1.6677785190126753, + "step": 5000 + }, + { + "epoch": 1.6677785190126753, + "ref_ce_loss": 0.11881715804338455, + "step": 5000 + }, + { + "epoch": 1.6711140760507006, + "loss": 0.6504, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "grad_norm": 3.3348193168640137, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "learning_rate": 0.0002755248009610218, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "loss": 0.5066215991973877, + "step": 5010 + }, + { + "ce_loss": 0.19570772349834442, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "distill_loss": 0.1310369223356247, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "ref_ce_loss": 0.1483002007007599, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "loss": 0.4725184142589569, + "step": 5010 + }, + { + "ce_loss": 0.19623509049415588, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "distill_loss": 0.1314346045255661, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "ref_ce_loss": 0.14413948357105255, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "loss": 0.7104153037071228, + "step": 5010 + }, + { + "ce_loss": 0.22572150826454163, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "distill_loss": 0.14044228196144104, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "ref_ce_loss": 0.13732099533081055, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "loss": 0.8014934659004211, + "step": 5010 + }, + { + "ce_loss": 0.24018052220344543, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "distill_loss": 0.1529025137424469, + "epoch": 1.6711140760507006, + "step": 5010 + }, + { + "epoch": 1.6711140760507006, + "ref_ce_loss": 0.1667167842388153, + "step": 5010 + }, + { + "epoch": 1.674449633088726, + "loss": 0.6296, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "grad_norm": 2.7721095085144043, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "learning_rate": 0.00027541379231122115, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "loss": 0.8057844638824463, + "step": 5020 + }, + { + "ce_loss": 0.3243635594844818, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "distill_loss": 0.1830526441335678, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "ref_ce_loss": 0.2075968086719513, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "loss": 0.7243004441261292, + "step": 5020 + }, + { + "ce_loss": 0.16893361508846283, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "distill_loss": 0.0983588844537735, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "ref_ce_loss": 0.09305059164762497, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "loss": 0.7263129949569702, + "step": 5020 + }, + { + "ce_loss": 0.22422388195991516, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "distill_loss": 0.13498666882514954, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "ref_ce_loss": 0.14109773933887482, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "loss": 0.4796724021434784, + "step": 5020 + }, + { + "ce_loss": 0.18545092642307281, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "distill_loss": 0.12434764206409454, + "epoch": 1.674449633088726, + "step": 5020 + }, + { + "epoch": 1.674449633088726, + "ref_ce_loss": 0.12115143239498138, + "step": 5020 + }, + { + "epoch": 1.6777851901267513, + "loss": 0.6394, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "grad_norm": 2.989145040512085, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "learning_rate": 0.000275302554956135, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "loss": 0.7933121919631958, + "step": 5030 + }, + { + "ce_loss": 0.28813034296035767, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "distill_loss": 0.15757888555526733, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "ref_ce_loss": 0.1353616714477539, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "loss": 0.8034152984619141, + "step": 5030 + }, + { + "ce_loss": 0.21584422886371613, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "distill_loss": 0.1322658658027649, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "ref_ce_loss": 0.18953277170658112, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "loss": 1.4683685302734375, + "step": 5030 + }, + { + "ce_loss": 0.2784746587276459, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "distill_loss": 0.09764213114976883, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "ref_ce_loss": 0.2288404405117035, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "loss": 0.6212664246559143, + "step": 5030 + }, + { + "ce_loss": 0.28214043378829956, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "distill_loss": 0.13105669617652893, + "epoch": 1.6777851901267513, + "step": 5030 + }, + { + "epoch": 1.6777851901267513, + "ref_ce_loss": 0.14725418388843536, + "step": 5030 + }, + { + "epoch": 1.6811207471647767, + "loss": 0.6674, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "grad_norm": 1.8358932733535767, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "learning_rate": 0.0002751910890986164, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "loss": 0.6445333957672119, + "step": 5040 + }, + { + "ce_loss": 0.14290131628513336, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "distill_loss": 0.09842705726623535, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "ref_ce_loss": 0.1507960706949234, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "loss": 0.6434434056282043, + "step": 5040 + }, + { + "ce_loss": 0.1379636973142624, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "distill_loss": 0.12902599573135376, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "ref_ce_loss": 0.12259085476398468, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "loss": 0.3519287407398224, + "step": 5040 + }, + { + "ce_loss": 0.14540047943592072, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "distill_loss": 0.10962574928998947, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "ref_ce_loss": 0.09651947766542435, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "loss": 0.4088886082172394, + "step": 5040 + }, + { + "ce_loss": 0.10746689885854721, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "distill_loss": 0.1186119094491005, + "epoch": 1.6811207471647767, + "step": 5040 + }, + { + "epoch": 1.6811207471647767, + "ref_ce_loss": 0.12964355945587158, + "step": 5040 + }, + { + "epoch": 1.684456304202802, + "loss": 0.6478, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "grad_norm": 2.647010326385498, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "learning_rate": 0.0002750793949419351, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "loss": 0.4091527462005615, + "step": 5050 + }, + { + "ce_loss": 0.12278663367033005, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "distill_loss": 0.17358151078224182, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "ref_ce_loss": 0.1125297099351883, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "loss": 0.547415018081665, + "step": 5050 + }, + { + "ce_loss": 0.1731032133102417, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "distill_loss": 0.13061532378196716, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "ref_ce_loss": 0.16719424724578857, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "loss": 0.6925753951072693, + "step": 5050 + }, + { + "ce_loss": 0.28311067819595337, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "distill_loss": 0.193869948387146, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "ref_ce_loss": 0.1512855738401413, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "loss": 0.7059001922607422, + "step": 5050 + }, + { + "ce_loss": 0.27139025926589966, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "distill_loss": 0.16281284391880035, + "epoch": 1.684456304202802, + "step": 5050 + }, + { + "epoch": 1.684456304202802, + "ref_ce_loss": 0.19652172923088074, + "step": 5050 + }, + { + "epoch": 1.6877918612408274, + "loss": 0.603, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "grad_norm": 2.900559902191162, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "learning_rate": 0.0002749674726897773, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "loss": 0.4165264070034027, + "step": 5060 + }, + { + "ce_loss": 0.14308080077171326, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "distill_loss": 0.12662950158119202, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "ref_ce_loss": 0.1465776264667511, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "loss": 0.5354675650596619, + "step": 5060 + }, + { + "ce_loss": 0.21277961134910583, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "distill_loss": 0.16139402985572815, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "ref_ce_loss": 0.08568815141916275, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "loss": 0.7034105658531189, + "step": 5060 + }, + { + "ce_loss": 0.2749280035495758, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "distill_loss": 0.17789869010448456, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "ref_ce_loss": 0.19476871192455292, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "loss": 0.44613534212112427, + "step": 5060 + }, + { + "ce_loss": 0.18843120336532593, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "distill_loss": 0.14277829229831696, + "epoch": 1.6877918612408274, + "step": 5060 + }, + { + "epoch": 1.6877918612408274, + "ref_ce_loss": 0.11453019082546234, + "step": 5060 + }, + { + "epoch": 1.6911274182788527, + "loss": 0.5778, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "grad_norm": 2.2885119915008545, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "learning_rate": 0.000274855322546245, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "loss": 0.3868217468261719, + "step": 5070 + }, + { + "ce_loss": 0.15230241417884827, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "distill_loss": 0.092768594622612, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "ref_ce_loss": 0.07845815271139145, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "loss": 0.7932264804840088, + "step": 5070 + }, + { + "ce_loss": 0.2804737091064453, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "distill_loss": 0.12050312012434006, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "ref_ce_loss": 0.20722705125808716, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "loss": 0.8451511263847351, + "step": 5070 + }, + { + "ce_loss": 0.246019646525383, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "distill_loss": 0.10395143181085587, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "ref_ce_loss": 0.1522131860256195, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "loss": 0.49361860752105713, + "step": 5070 + }, + { + "ce_loss": 0.19995959103107452, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "distill_loss": 0.10739613324403763, + "epoch": 1.6911274182788527, + "step": 5070 + }, + { + "epoch": 1.6911274182788527, + "ref_ce_loss": 0.1293676495552063, + "step": 5070 + }, + { + "epoch": 1.694462975316878, + "loss": 0.6237, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "grad_norm": 3.2725327014923096, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "learning_rate": 0.00027474294471585564, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "loss": 0.740882933139801, + "step": 5080 + }, + { + "ce_loss": 0.27106061577796936, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "distill_loss": 0.12209445983171463, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "ref_ce_loss": 0.1586221605539322, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "loss": 0.882435142993927, + "step": 5080 + }, + { + "ce_loss": 0.14245213568210602, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "distill_loss": 0.10405879467725754, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "ref_ce_loss": 0.18394401669502258, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "loss": 0.4717642664909363, + "step": 5080 + }, + { + "ce_loss": 0.21122273802757263, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "distill_loss": 0.11571598052978516, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "ref_ce_loss": 0.10213414579629898, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "loss": 0.5567148923873901, + "step": 5080 + }, + { + "ce_loss": 0.2327110916376114, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "distill_loss": 0.1081765741109848, + "epoch": 1.694462975316878, + "step": 5080 + }, + { + "epoch": 1.694462975316878, + "ref_ce_loss": 0.12438291311264038, + "step": 5080 + }, + { + "epoch": 1.6977985323549034, + "loss": 0.6664, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "grad_norm": 4.2465643882751465, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "learning_rate": 0.0002746303394035423, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "loss": 0.7007617354393005, + "step": 5090 + }, + { + "ce_loss": 0.2409987598657608, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "distill_loss": 0.2312084138393402, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "ref_ce_loss": 0.13980107009410858, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "loss": 0.8144757747650146, + "step": 5090 + }, + { + "ce_loss": 0.18151697516441345, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "distill_loss": 0.2494632452726364, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "ref_ce_loss": 0.17133742570877075, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "loss": 0.6633246541023254, + "step": 5090 + }, + { + "ce_loss": 0.23428836464881897, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "distill_loss": 0.21696534752845764, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "ref_ce_loss": 0.1617213934659958, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "loss": 1.046887993812561, + "step": 5090 + }, + { + "ce_loss": 0.19139772653579712, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "distill_loss": 0.31649070978164673, + "epoch": 1.6977985323549034, + "step": 5090 + }, + { + "epoch": 1.6977985323549034, + "ref_ce_loss": 0.10765596479177475, + "step": 5090 + }, + { + "epoch": 1.7011340893929288, + "loss": 0.6845, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "grad_norm": 3.3320302963256836, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "learning_rate": 0.00027451750681465253, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "loss": 0.5628963112831116, + "step": 5100 + }, + { + "ce_loss": 0.1644999235868454, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "distill_loss": 0.1443501114845276, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "ref_ce_loss": 0.09509146213531494, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "loss": 0.6696035861968994, + "step": 5100 + }, + { + "ce_loss": 0.2381826490163803, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "distill_loss": 0.1346670538187027, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "ref_ce_loss": 0.15772823989391327, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "loss": 0.6247545480728149, + "step": 5100 + }, + { + "ce_loss": 0.22702516615390778, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "distill_loss": 0.13249506056308746, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "ref_ce_loss": 0.156106099486351, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "loss": 0.6921025514602661, + "step": 5100 + }, + { + "ce_loss": 0.2253514677286148, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "distill_loss": 0.15082751214504242, + "epoch": 1.7011340893929288, + "step": 5100 + }, + { + "epoch": 1.7011340893929288, + "ref_ce_loss": 0.159327894449234, + "step": 5100 + }, + { + "epoch": 1.704469646430954, + "loss": 0.5672, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "grad_norm": 1.8688466548919678, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "learning_rate": 0.00027440444715494844, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "loss": 0.5153326988220215, + "step": 5110 + }, + { + "ce_loss": 0.2143237441778183, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "distill_loss": 0.12158432602882385, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "ref_ce_loss": 0.1791994422674179, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "loss": 0.6404139995574951, + "step": 5110 + }, + { + "ce_loss": 0.22489234805107117, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "distill_loss": 0.10260467231273651, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "ref_ce_loss": 0.13702525198459625, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "loss": 0.5092939734458923, + "step": 5110 + }, + { + "ce_loss": 0.19932390749454498, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "distill_loss": 0.11346331983804703, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "ref_ce_loss": 0.12395470589399338, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "loss": 0.7422287464141846, + "step": 5110 + }, + { + "ce_loss": 0.16377632319927216, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "distill_loss": 0.11026269942522049, + "epoch": 1.704469646430954, + "step": 5110 + }, + { + "epoch": 1.704469646430954, + "ref_ce_loss": 0.08629206568002701, + "step": 5110 + }, + { + "epoch": 1.7078052034689795, + "loss": 0.639, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "grad_norm": 2.86659574508667, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "learning_rate": 0.0002742911606306063, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "loss": 0.4779875576496124, + "step": 5120 + }, + { + "ce_loss": 0.1897718906402588, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "distill_loss": 0.11675845831632614, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "ref_ce_loss": 0.12058022618293762, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "loss": 0.46407264471054077, + "step": 5120 + }, + { + "ce_loss": 0.11707013100385666, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "distill_loss": 0.08361580967903137, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "ref_ce_loss": 0.09062016010284424, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "loss": 0.5182336568832397, + "step": 5120 + }, + { + "ce_loss": 0.23112425208091736, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "distill_loss": 0.10435008257627487, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "ref_ce_loss": 0.13188083469867706, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "loss": 0.47077101469039917, + "step": 5120 + }, + { + "ce_loss": 0.1601075679063797, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "distill_loss": 0.12487640976905823, + "epoch": 1.7078052034689795, + "step": 5120 + }, + { + "epoch": 1.7078052034689795, + "ref_ce_loss": 0.11737626045942307, + "step": 5120 + }, + { + "epoch": 1.7111407605070048, + "loss": 0.5843, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "grad_norm": 3.353935718536377, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "learning_rate": 0.00027417764744821604, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "loss": 0.5513918995857239, + "step": 5130 + }, + { + "ce_loss": 0.16730625927448273, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "distill_loss": 0.12257963418960571, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "ref_ce_loss": 0.11809415370225906, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "loss": 0.6963628530502319, + "step": 5130 + }, + { + "ce_loss": 0.22674696147441864, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "distill_loss": 0.1302148550748825, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "ref_ce_loss": 0.11398036032915115, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "loss": 0.8863782286643982, + "step": 5130 + }, + { + "ce_loss": 0.3215571343898773, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "distill_loss": 0.12419331818819046, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "ref_ce_loss": 0.19529564678668976, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "loss": 0.7732776999473572, + "step": 5130 + }, + { + "ce_loss": 0.32119056582450867, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "distill_loss": 0.12385230511426926, + "epoch": 1.7111407605070048, + "step": 5130 + }, + { + "epoch": 1.7111407605070048, + "ref_ce_loss": 0.16419844329357147, + "step": 5130 + }, + { + "epoch": 1.7144763175450302, + "loss": 0.553, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "grad_norm": 2.516479015350342, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "learning_rate": 0.00027406390781478093, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "loss": 0.5765625238418579, + "step": 5140 + }, + { + "ce_loss": 0.1710415482521057, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "distill_loss": 0.11104995012283325, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "ref_ce_loss": 0.09997311234474182, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "loss": 0.44053176045417786, + "step": 5140 + }, + { + "ce_loss": 0.19158673286437988, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "distill_loss": 0.11490902304649353, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "ref_ce_loss": 0.09579595178365707, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "loss": 0.5100491046905518, + "step": 5140 + }, + { + "ce_loss": 0.18296188116073608, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "distill_loss": 0.11433611810207367, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "ref_ce_loss": 0.07839963585138321, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "loss": 0.4291071593761444, + "step": 5140 + }, + { + "ce_loss": 0.15751199424266815, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "distill_loss": 0.09726285189390182, + "epoch": 1.7144763175450302, + "step": 5140 + }, + { + "epoch": 1.7144763175450302, + "ref_ce_loss": 0.12397287786006927, + "step": 5140 + }, + { + "epoch": 1.7178118745830555, + "loss": 0.5899, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "grad_norm": 2.534189224243164, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "learning_rate": 0.00027394994193771717, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "loss": 1.122070550918579, + "step": 5150 + }, + { + "ce_loss": 0.2445487380027771, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "distill_loss": 0.135958731174469, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "ref_ce_loss": 0.13551239669322968, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "loss": 0.5136386156082153, + "step": 5150 + }, + { + "ce_loss": 0.2333669662475586, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "distill_loss": 0.09331156313419342, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "ref_ce_loss": 0.13432331383228302, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "loss": 0.5635353326797485, + "step": 5150 + }, + { + "ce_loss": 0.10622437298297882, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "distill_loss": 0.07141038030385971, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "ref_ce_loss": 0.1337660551071167, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "loss": 0.5571033358573914, + "step": 5150 + }, + { + "ce_loss": 0.20822681486606598, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "distill_loss": 0.14473620057106018, + "epoch": 1.7178118745830555, + "step": 5150 + }, + { + "epoch": 1.7178118745830555, + "ref_ce_loss": 0.14581289887428284, + "step": 5150 + }, + { + "epoch": 1.7211474316210809, + "loss": 0.6477, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "grad_norm": 3.7310683727264404, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "learning_rate": 0.0002738357500248536, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "loss": 0.4637722969055176, + "step": 5160 + }, + { + "ce_loss": 0.22166353464126587, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "distill_loss": 0.11157701909542084, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "ref_ce_loss": 0.1303255707025528, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "loss": 0.7100945711135864, + "step": 5160 + }, + { + "ce_loss": 0.1325289011001587, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "distill_loss": 0.10556505620479584, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "ref_ce_loss": 0.09780916571617126, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "loss": 0.6478838920593262, + "step": 5160 + }, + { + "ce_loss": 0.214467391371727, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "distill_loss": 0.16932272911071777, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "ref_ce_loss": 0.09977413713932037, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "loss": 0.2837287187576294, + "step": 5160 + }, + { + "ce_loss": 0.10429967194795609, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "distill_loss": 0.07188081741333008, + "epoch": 1.7211474316210809, + "step": 5160 + }, + { + "epoch": 1.7211474316210809, + "ref_ce_loss": 0.0646226704120636, + "step": 5160 + }, + { + "epoch": 1.7244829886591062, + "loss": 0.6087, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "grad_norm": 3.750103235244751, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "learning_rate": 0.0002737213322844312, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "loss": 0.7976568937301636, + "step": 5170 + }, + { + "ce_loss": 0.1181984469294548, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "distill_loss": 0.098573699593544, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "ref_ce_loss": 0.1335592269897461, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "loss": 0.6125006675720215, + "step": 5170 + }, + { + "ce_loss": 0.22754055261611938, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "distill_loss": 0.12978272140026093, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "ref_ce_loss": 0.15586386620998383, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "loss": 0.4436233639717102, + "step": 5170 + }, + { + "ce_loss": 0.09966287016868591, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "distill_loss": 0.09829822927713394, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "ref_ce_loss": 0.10884232074022293, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "loss": 0.9224821329116821, + "step": 5170 + }, + { + "ce_loss": 0.17320190370082855, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "distill_loss": 0.12536008656024933, + "epoch": 1.7244829886591062, + "step": 5170 + }, + { + "epoch": 1.7244829886591062, + "ref_ce_loss": 0.12005515396595001, + "step": 5170 + }, + { + "epoch": 1.7278185456971316, + "loss": 0.6052, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "grad_norm": 2.824880838394165, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "learning_rate": 0.0002736066889251028, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "loss": 0.5080110430717468, + "step": 5180 + }, + { + "ce_loss": 0.20835170149803162, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "distill_loss": 0.11417423188686371, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "ref_ce_loss": 0.18518763780593872, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "loss": 0.9636545181274414, + "step": 5180 + }, + { + "ce_loss": 0.24943476915359497, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "distill_loss": 0.15834645926952362, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "ref_ce_loss": 0.18894070386886597, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "loss": 0.39006200432777405, + "step": 5180 + }, + { + "ce_loss": 0.1016741469502449, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "distill_loss": 0.09278419613838196, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "ref_ce_loss": 0.12148765474557877, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "loss": 0.5657213926315308, + "step": 5180 + }, + { + "ce_loss": 0.18592000007629395, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "distill_loss": 0.11897885799407959, + "epoch": 1.7278185456971316, + "step": 5180 + }, + { + "epoch": 1.7278185456971316, + "ref_ce_loss": 0.12840834259986877, + "step": 5180 + }, + { + "epoch": 1.731154102735157, + "loss": 0.6304, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "grad_norm": 2.494673013687134, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "learning_rate": 0.0002734918201559326, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "loss": 0.47465330362319946, + "step": 5190 + }, + { + "ce_loss": 0.1857297122478485, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "distill_loss": 0.12139198184013367, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "ref_ce_loss": 0.1320515125989914, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "loss": 1.148721694946289, + "step": 5190 + }, + { + "ce_loss": 0.24933886528015137, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "distill_loss": 0.14437663555145264, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "ref_ce_loss": 0.1374908685684204, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "loss": 0.31072619557380676, + "step": 5190 + }, + { + "ce_loss": 0.10767339915037155, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "distill_loss": 0.13216081261634827, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "ref_ce_loss": 0.07079358398914337, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "loss": 0.3686656057834625, + "step": 5190 + }, + { + "ce_loss": 0.13234637677669525, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "distill_loss": 0.11700987070798874, + "epoch": 1.731154102735157, + "step": 5190 + }, + { + "epoch": 1.731154102735157, + "ref_ce_loss": 0.11908116191625595, + "step": 5190 + }, + { + "epoch": 1.7344896597731823, + "loss": 0.6093, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "grad_norm": 4.322832107543945, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "learning_rate": 0.00027337672618639604, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "loss": 0.6111628413200378, + "step": 5200 + }, + { + "ce_loss": 0.15295933187007904, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "distill_loss": 0.10183216631412506, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "ref_ce_loss": 0.14784273505210876, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "loss": 0.875718891620636, + "step": 5200 + }, + { + "ce_loss": 0.28108495473861694, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "distill_loss": 0.1419355273246765, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "ref_ce_loss": 0.20049789547920227, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "loss": 0.5750555396080017, + "step": 5200 + }, + { + "ce_loss": 0.18172144889831543, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "distill_loss": 0.10994206368923187, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "ref_ce_loss": 0.11367204785346985, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "loss": 0.5577737092971802, + "step": 5200 + }, + { + "ce_loss": 0.2857550382614136, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "distill_loss": 0.12473808228969574, + "epoch": 1.7344896597731823, + "step": 5200 + }, + { + "epoch": 1.7344896597731823, + "ref_ce_loss": 0.1469634473323822, + "step": 5200 + }, + { + "epoch": 1.7378252168112076, + "loss": 0.588, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "grad_norm": 1.895171046257019, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "learning_rate": 0.0002732614072263791, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "loss": 0.6798540949821472, + "step": 5210 + }, + { + "ce_loss": 0.20952335000038147, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "distill_loss": 0.11695443838834763, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "ref_ce_loss": 0.11746593564748764, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "loss": 0.26200753450393677, + "step": 5210 + }, + { + "ce_loss": 0.07369393855333328, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "distill_loss": 0.06114661321043968, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "ref_ce_loss": 0.07951971143484116, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "loss": 0.6085673570632935, + "step": 5210 + }, + { + "ce_loss": 0.24875447154045105, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "distill_loss": 0.10614325106143951, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "ref_ce_loss": 0.1640029400587082, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "loss": 1.0081214904785156, + "step": 5210 + }, + { + "ce_loss": 0.21574634313583374, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "distill_loss": 0.07902166247367859, + "epoch": 1.7378252168112076, + "step": 5210 + }, + { + "epoch": 1.7378252168112076, + "ref_ce_loss": 0.10362114012241364, + "step": 5210 + }, + { + "epoch": 1.741160773849233, + "loss": 0.6313, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "grad_norm": 2.011845111846924, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "learning_rate": 0.00027314586348617793, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "loss": 0.6953931450843811, + "step": 5220 + }, + { + "ce_loss": 0.23883208632469177, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "distill_loss": 0.11485342681407928, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "ref_ce_loss": 0.14983177185058594, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "loss": 0.5598002672195435, + "step": 5220 + }, + { + "ce_loss": 0.14301392436027527, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "distill_loss": 0.11065421998500824, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "ref_ce_loss": 0.15477022528648376, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "loss": 0.5702969431877136, + "step": 5220 + }, + { + "ce_loss": 0.23705795407295227, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "distill_loss": 0.09813598543405533, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "ref_ce_loss": 0.23498722910881042, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "loss": 0.4153706431388855, + "step": 5220 + }, + { + "ce_loss": 0.15375342965126038, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "distill_loss": 0.09605289995670319, + "epoch": 1.741160773849233, + "step": 5220 + }, + { + "epoch": 1.741160773849233, + "ref_ce_loss": 0.09962562471628189, + "step": 5220 + }, + { + "epoch": 1.7444963308872583, + "loss": 0.6265, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "grad_norm": 3.9798173904418945, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "learning_rate": 0.0002730300951764989, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "loss": 1.071535587310791, + "step": 5230 + }, + { + "ce_loss": 0.24925430119037628, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "distill_loss": 0.21522605419158936, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "ref_ce_loss": 0.17369335889816284, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "loss": 0.648411750793457, + "step": 5230 + }, + { + "ce_loss": 0.243378147482872, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "distill_loss": 0.16543267667293549, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "ref_ce_loss": 0.1546335220336914, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "loss": 1.0082130432128906, + "step": 5230 + }, + { + "ce_loss": 0.30150628089904785, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "distill_loss": 0.19451621174812317, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "ref_ce_loss": 0.12648442387580872, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "loss": 0.4836832284927368, + "step": 5230 + }, + { + "ce_loss": 0.15982374548912048, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "distill_loss": 0.17217841744422913, + "epoch": 1.7444963308872583, + "step": 5230 + }, + { + "epoch": 1.7444963308872583, + "ref_ce_loss": 0.0835745707154274, + "step": 5230 + }, + { + "epoch": 1.7478318879252837, + "loss": 0.6475, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "grad_norm": 2.498871088027954, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "learning_rate": 0.0002729141025084577, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "loss": 0.6212328672409058, + "step": 5240 + }, + { + "ce_loss": 0.15493199229240417, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "distill_loss": 0.1978866457939148, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "ref_ce_loss": 0.13444751501083374, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "loss": 0.5558438897132874, + "step": 5240 + }, + { + "ce_loss": 0.21160100400447845, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "distill_loss": 0.1478114128112793, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "ref_ce_loss": 0.1627282351255417, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "loss": 0.9472699165344238, + "step": 5240 + }, + { + "ce_loss": 0.25463220477104187, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "distill_loss": 0.1912805289030075, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "ref_ce_loss": 0.13427430391311646, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "loss": 0.5777902007102966, + "step": 5240 + }, + { + "ce_loss": 0.2193995863199234, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "distill_loss": 0.19780264794826508, + "epoch": 1.7478318879252837, + "step": 5240 + }, + { + "epoch": 1.7478318879252837, + "ref_ce_loss": 0.16050836443901062, + "step": 5240 + }, + { + "epoch": 1.751167444963309, + "loss": 0.6298, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "grad_norm": 3.098067283630371, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "learning_rate": 0.00027279788569357916, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "loss": 0.6694287061691284, + "step": 5250 + }, + { + "ce_loss": 0.2001875340938568, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "distill_loss": 0.13199788331985474, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "ref_ce_loss": 0.17222945392131805, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "loss": 0.30083397030830383, + "step": 5250 + }, + { + "ce_loss": 0.10140173882246017, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "distill_loss": 0.10152903199195862, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "ref_ce_loss": 0.09783918410539627, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "loss": 0.5701338648796082, + "step": 5250 + }, + { + "ce_loss": 0.2400723248720169, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "distill_loss": 0.1128765121102333, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "ref_ce_loss": 0.14214551448822021, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "loss": 1.617339849472046, + "step": 5250 + }, + { + "ce_loss": 0.34532344341278076, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "distill_loss": 0.20758526027202606, + "epoch": 1.751167444963309, + "step": 5250 + }, + { + "epoch": 1.751167444963309, + "ref_ce_loss": 0.2039952427148819, + "step": 5250 + }, + { + "epoch": 1.7545030020013344, + "loss": 0.6432, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "grad_norm": 3.359388828277588, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "learning_rate": 0.0002726814449437969, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "loss": 0.35906630754470825, + "step": 5260 + }, + { + "ce_loss": 0.12533292174339294, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "distill_loss": 0.0773266926407814, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "ref_ce_loss": 0.09706180542707443, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "loss": 0.8488431572914124, + "step": 5260 + }, + { + "ce_loss": 0.19705916941165924, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "distill_loss": 0.12958835065364838, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "ref_ce_loss": 0.17487740516662598, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "loss": 0.6030906438827515, + "step": 5260 + }, + { + "ce_loss": 0.17236897349357605, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "distill_loss": 0.11375070363283157, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "ref_ce_loss": 0.14064763486385345, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "loss": 0.8054187297821045, + "step": 5260 + }, + { + "ce_loss": 0.2541971206665039, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "distill_loss": 0.14604657888412476, + "epoch": 1.7545030020013344, + "step": 5260 + }, + { + "epoch": 1.7545030020013344, + "ref_ce_loss": 0.1376919001340866, + "step": 5260 + }, + { + "epoch": 1.7578385590393597, + "loss": 0.6159, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "grad_norm": 3.611070394515991, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "learning_rate": 0.00027256478047145297, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "loss": 0.4489757716655731, + "step": 5270 + }, + { + "ce_loss": 0.15936985611915588, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "distill_loss": 0.13974682986736298, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "ref_ce_loss": 0.12064920365810394, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "loss": 0.35650843381881714, + "step": 5270 + }, + { + "ce_loss": 0.07867399603128433, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "distill_loss": 0.11882077902555466, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "ref_ce_loss": 0.09165604412555695, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "loss": 0.40967947244644165, + "step": 5270 + }, + { + "ce_loss": 0.1292242407798767, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "distill_loss": 0.12140464037656784, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "ref_ce_loss": 0.10581637173891068, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "loss": 0.6546857357025146, + "step": 5270 + }, + { + "ce_loss": 0.2300027757883072, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "distill_loss": 0.15563297271728516, + "epoch": 1.7578385590393597, + "step": 5270 + }, + { + "epoch": 1.7578385590393597, + "ref_ce_loss": 0.126151442527771, + "step": 5270 + }, + { + "epoch": 1.761174116077385, + "loss": 0.5937, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "grad_norm": 2.565040349960327, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "learning_rate": 0.00027244789248929735, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "loss": 0.4498351812362671, + "step": 5280 + }, + { + "ce_loss": 0.10470875352621078, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "distill_loss": 0.14078587293624878, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "ref_ce_loss": 0.08281106501817703, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "loss": 0.5353066921234131, + "step": 5280 + }, + { + "ce_loss": 0.1827135682106018, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "distill_loss": 0.12412932515144348, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "ref_ce_loss": 0.1676645129919052, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "loss": 0.39925214648246765, + "step": 5280 + }, + { + "ce_loss": 0.11564359813928604, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "distill_loss": 0.13008667528629303, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "ref_ce_loss": 0.11450548470020294, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "loss": 0.9488141536712646, + "step": 5280 + }, + { + "ce_loss": 0.296876460313797, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "distill_loss": 0.13965021073818207, + "epoch": 1.761174116077385, + "step": 5280 + }, + { + "epoch": 1.761174116077385, + "ref_ce_loss": 0.2269418090581894, + "step": 5280 + }, + { + "epoch": 1.7645096731154104, + "loss": 0.5795, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "grad_norm": 4.342066287994385, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "learning_rate": 0.0002723307812104875, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "loss": 0.4556236267089844, + "step": 5290 + }, + { + "ce_loss": 0.19900187849998474, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "distill_loss": 0.12070953100919724, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "ref_ce_loss": 0.1357794851064682, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "loss": 0.49381351470947266, + "step": 5290 + }, + { + "ce_loss": 0.22090773284435272, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "distill_loss": 0.15897664427757263, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "ref_ce_loss": 0.11341068148612976, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "loss": 0.8587750196456909, + "step": 5290 + }, + { + "ce_loss": 0.3212994337081909, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "distill_loss": 0.189230814576149, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "ref_ce_loss": 0.17310874164104462, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "loss": 1.1447008848190308, + "step": 5290 + }, + { + "ce_loss": 0.16058073937892914, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "distill_loss": 0.14463923871517181, + "epoch": 1.7645096731154104, + "step": 5290 + }, + { + "epoch": 1.7645096731154104, + "ref_ce_loss": 0.12579326331615448, + "step": 5290 + }, + { + "epoch": 1.7678452301534358, + "loss": 0.6033, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "grad_norm": 3.2570016384124756, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "learning_rate": 0.00027221344684858834, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "loss": 0.5072019696235657, + "step": 5300 + }, + { + "ce_loss": 0.15257352590560913, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "distill_loss": 0.13202786445617676, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "ref_ce_loss": 0.12185128033161163, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "loss": 0.4970143437385559, + "step": 5300 + }, + { + "ce_loss": 0.19305184483528137, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "distill_loss": 0.15051575005054474, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "ref_ce_loss": 0.12638255953788757, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "loss": 0.6955916881561279, + "step": 5300 + }, + { + "ce_loss": 0.20245814323425293, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "distill_loss": 0.14921513199806213, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "ref_ce_loss": 0.16635259985923767, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "loss": 0.6717430353164673, + "step": 5300 + }, + { + "ce_loss": 0.2943767011165619, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "distill_loss": 0.2113085836172104, + "epoch": 1.7678452301534358, + "step": 5300 + }, + { + "epoch": 1.7678452301534358, + "ref_ce_loss": 0.16593855619430542, + "step": 5300 + }, + { + "epoch": 1.771180787191461, + "loss": 0.6271, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "grad_norm": 3.04647159576416, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "learning_rate": 0.00027209588961757137, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "loss": 1.033667802810669, + "step": 5310 + }, + { + "ce_loss": 0.18642698228359222, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "distill_loss": 0.09144186973571777, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "ref_ce_loss": 0.12195436656475067, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "loss": 0.5744385719299316, + "step": 5310 + }, + { + "ce_loss": 0.19218167662620544, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "distill_loss": 0.0962839350104332, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "ref_ce_loss": 0.14160801470279694, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "loss": 0.5619361996650696, + "step": 5310 + }, + { + "ce_loss": 0.2757478356361389, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "distill_loss": 0.14615021646022797, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "ref_ce_loss": 0.13982820510864258, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "loss": 0.38822707533836365, + "step": 5310 + }, + { + "ce_loss": 0.18420282006263733, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "distill_loss": 0.10989782214164734, + "epoch": 1.771180787191461, + "step": 5310 + }, + { + "epoch": 1.771180787191461, + "ref_ce_loss": 0.09404101222753525, + "step": 5310 + }, + { + "epoch": 1.7745163442294865, + "loss": 0.6356, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "grad_norm": 2.576591730117798, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "learning_rate": 0.0002719781097318147, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "loss": 1.1352790594100952, + "step": 5320 + }, + { + "ce_loss": 0.3041479289531708, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "distill_loss": 0.0969984382390976, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "ref_ce_loss": 0.23569048941135406, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "loss": 1.0350360870361328, + "step": 5320 + }, + { + "ce_loss": 0.3750414252281189, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "distill_loss": 0.15197308361530304, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "ref_ce_loss": 0.2801123857498169, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "loss": 0.6875044107437134, + "step": 5320 + }, + { + "ce_loss": 0.19332902133464813, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "distill_loss": 0.09888143092393875, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "ref_ce_loss": 0.1342381089925766, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "loss": 0.4631473124027252, + "step": 5320 + }, + { + "ce_loss": 0.15175089240074158, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "distill_loss": 0.09046868234872818, + "epoch": 1.7745163442294865, + "step": 5320 + }, + { + "epoch": 1.7745163442294865, + "ref_ce_loss": 0.1468697339296341, + "step": 5320 + }, + { + "epoch": 1.7778519012675118, + "loss": 0.6277, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "grad_norm": 3.112802505493164, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "learning_rate": 0.00027186010740610226, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "loss": 1.1258635520935059, + "step": 5330 + }, + { + "ce_loss": 0.23738060891628265, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "distill_loss": 0.12533307075500488, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "ref_ce_loss": 0.16516506671905518, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "loss": 0.9647006988525391, + "step": 5330 + }, + { + "ce_loss": 0.199024498462677, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "distill_loss": 0.1232900470495224, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "ref_ce_loss": 0.1646987795829773, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "loss": 0.4685381054878235, + "step": 5330 + }, + { + "ce_loss": 0.14527423679828644, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "distill_loss": 0.11627337336540222, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "ref_ce_loss": 0.10118523985147476, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "loss": 0.3879113793373108, + "step": 5330 + }, + { + "ce_loss": 0.15242859721183777, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "distill_loss": 0.0961771309375763, + "epoch": 1.7778519012675118, + "step": 5330 + }, + { + "epoch": 1.7778519012675118, + "ref_ce_loss": 0.09178163856267929, + "step": 5330 + }, + { + "epoch": 1.7811874583055372, + "loss": 0.5619, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "grad_norm": 5.199608325958252, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "learning_rate": 0.00027174188285562377, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "loss": 0.8281639814376831, + "step": 5340 + }, + { + "ce_loss": 0.1604779213666916, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "distill_loss": 0.11927327513694763, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "ref_ce_loss": 0.16226066648960114, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "loss": 1.659521460533142, + "step": 5340 + }, + { + "ce_loss": 0.2811453640460968, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "distill_loss": 0.15258263051509857, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "ref_ce_loss": 0.19549334049224854, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "loss": 0.26514604687690735, + "step": 5340 + }, + { + "ce_loss": 0.029982445761561394, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "distill_loss": 0.07369742542505264, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "ref_ce_loss": 0.10002217441797256, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "loss": 0.3143136501312256, + "step": 5340 + }, + { + "ce_loss": 0.07460236549377441, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "distill_loss": 0.08271145820617676, + "epoch": 1.7811874583055372, + "step": 5340 + }, + { + "epoch": 1.7811874583055372, + "ref_ce_loss": 0.11252786964178085, + "step": 5340 + }, + { + "epoch": 1.7845230153435625, + "loss": 0.6054, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "grad_norm": 1.8053847551345825, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "learning_rate": 0.00027162343629597425, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "loss": 0.4593953788280487, + "step": 5350 + }, + { + "ce_loss": 0.04222152754664421, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "distill_loss": 0.07897884398698807, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "ref_ce_loss": 0.11623603850603104, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "loss": 0.518944263458252, + "step": 5350 + }, + { + "ce_loss": 0.17399434745311737, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "distill_loss": 0.14518676698207855, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "ref_ce_loss": 0.12270195037126541, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "loss": 0.7417664527893066, + "step": 5350 + }, + { + "ce_loss": 0.21204745769500732, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "distill_loss": 0.13328410685062408, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "ref_ce_loss": 0.07787536084651947, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "loss": 0.39500027894973755, + "step": 5350 + }, + { + "ce_loss": 0.1639915108680725, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "distill_loss": 0.10372168570756912, + "epoch": 1.7845230153435625, + "step": 5350 + }, + { + "epoch": 1.7845230153435625, + "ref_ce_loss": 0.0849570780992508, + "step": 5350 + }, + { + "epoch": 1.7878585723815879, + "loss": 0.6098, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "grad_norm": 2.036372184753418, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "learning_rate": 0.00027150476794315345, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "loss": 0.365789532661438, + "step": 5360 + }, + { + "ce_loss": 0.12750551104545593, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "distill_loss": 0.10385686159133911, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "ref_ce_loss": 0.13411588966846466, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "loss": 0.37090298533439636, + "step": 5360 + }, + { + "ce_loss": 0.13140667974948883, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "distill_loss": 0.1126258373260498, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "ref_ce_loss": 0.09275903552770615, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "loss": 1.2308400869369507, + "step": 5360 + }, + { + "ce_loss": 0.21442733705043793, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "distill_loss": 0.1381993144750595, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "ref_ce_loss": 0.13881564140319824, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "loss": 0.38338571786880493, + "step": 5360 + }, + { + "ce_loss": 0.11972062289714813, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "distill_loss": 0.06880615651607513, + "epoch": 1.7878585723815879, + "step": 5360 + }, + { + "epoch": 1.7878585723815879, + "ref_ce_loss": 0.07677696645259857, + "step": 5360 + }, + { + "epoch": 1.7911941294196132, + "loss": 0.5662, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "grad_norm": 2.8769922256469727, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "learning_rate": 0.0002713858780135657, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "loss": 0.6280443668365479, + "step": 5370 + }, + { + "ce_loss": 0.16620515286922455, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "distill_loss": 0.0986478179693222, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "ref_ce_loss": 0.1469651609659195, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "loss": 0.8258258700370789, + "step": 5370 + }, + { + "ce_loss": 0.17810150980949402, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "distill_loss": 0.11801296472549438, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "ref_ce_loss": 0.23140060901641846, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "loss": 0.4521212577819824, + "step": 5370 + }, + { + "ce_loss": 0.10244987159967422, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "distill_loss": 0.08709345757961273, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "ref_ce_loss": 0.1392844021320343, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "loss": 0.5021611452102661, + "step": 5370 + }, + { + "ce_loss": 0.20533789694309235, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "distill_loss": 0.1511787325143814, + "epoch": 1.7911941294196132, + "step": 5370 + }, + { + "epoch": 1.7911941294196132, + "ref_ce_loss": 0.09348888695240021, + "step": 5370 + }, + { + "epoch": 1.7945296864576386, + "loss": 0.6176, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "grad_norm": 2.5719034671783447, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "learning_rate": 0.00027126676672401917, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "loss": 0.36035069823265076, + "step": 5380 + }, + { + "ce_loss": 0.11971081793308258, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "distill_loss": 0.09691092371940613, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "ref_ce_loss": 0.09212952107191086, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "loss": 0.5740700960159302, + "step": 5380 + }, + { + "ce_loss": 0.19906161725521088, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "distill_loss": 0.10676465183496475, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "ref_ce_loss": 0.13155211508274078, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "loss": 0.7756186127662659, + "step": 5380 + }, + { + "ce_loss": 0.19126425683498383, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "distill_loss": 0.14128831028938293, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "ref_ce_loss": 0.16342337429523468, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "loss": 0.33875101804733276, + "step": 5380 + }, + { + "ce_loss": 0.1185140386223793, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "distill_loss": 0.09045391529798508, + "epoch": 1.7945296864576386, + "step": 5380 + }, + { + "epoch": 1.7945296864576386, + "ref_ce_loss": 0.08819026499986649, + "step": 5380 + }, + { + "epoch": 1.797865243495664, + "loss": 0.5726, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "grad_norm": 2.365931987762451, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "learning_rate": 0.0002711474342917261, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "loss": 0.3825666904449463, + "step": 5390 + }, + { + "ce_loss": 0.13808047771453857, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "distill_loss": 0.12363763153553009, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "ref_ce_loss": 0.12069553881883621, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "loss": 0.35606908798217773, + "step": 5390 + }, + { + "ce_loss": 0.09475722163915634, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "distill_loss": 0.09718462079763412, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "ref_ce_loss": 0.11456798017024994, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "loss": 0.7455524802207947, + "step": 5390 + }, + { + "ce_loss": 0.20426201820373535, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "distill_loss": 0.12498379498720169, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "ref_ce_loss": 0.10539623349905014, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "loss": 0.8537323474884033, + "step": 5390 + }, + { + "ce_loss": 0.25624123215675354, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "distill_loss": 0.13884396851062775, + "epoch": 1.797865243495664, + "step": 5390 + }, + { + "epoch": 1.797865243495664, + "ref_ce_loss": 0.1865921914577484, + "step": 5390 + }, + { + "epoch": 1.8012008005336893, + "loss": 0.556, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "grad_norm": 2.434817314147949, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "learning_rate": 0.0002710278809343015, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "loss": 0.6801935434341431, + "step": 5400 + }, + { + "ce_loss": 0.19641169905662537, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "distill_loss": 0.09645406156778336, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "ref_ce_loss": 0.11136016249656677, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "loss": 0.6349014043807983, + "step": 5400 + }, + { + "ce_loss": 0.2154546082019806, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "distill_loss": 0.111332967877388, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "ref_ce_loss": 0.10834940522909164, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "loss": 0.8095474243164062, + "step": 5400 + }, + { + "ce_loss": 0.13908988237380981, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "distill_loss": 0.11110659688711166, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "ref_ce_loss": 0.08503452688455582, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "loss": 0.9587161540985107, + "step": 5400 + }, + { + "ce_loss": 0.17953532934188843, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "distill_loss": 0.09979195147752762, + "epoch": 1.8012008005336893, + "step": 5400 + }, + { + "epoch": 1.8012008005336893, + "ref_ce_loss": 0.11760571599006653, + "step": 5400 + }, + { + "epoch": 1.8045363575717146, + "loss": 0.6309, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "grad_norm": 2.2750089168548584, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "learning_rate": 0.00027090810686976373, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "loss": 0.7453268766403198, + "step": 5410 + }, + { + "ce_loss": 0.19390563666820526, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "distill_loss": 0.12300623953342438, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "ref_ce_loss": 0.2404060810804367, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "loss": 1.062126636505127, + "step": 5410 + }, + { + "ce_loss": 0.28073927760124207, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "distill_loss": 0.10538649559020996, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "ref_ce_loss": 0.14817595481872559, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "loss": 0.4358749985694885, + "step": 5410 + }, + { + "ce_loss": 0.1794261783361435, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "distill_loss": 0.09952785074710846, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "ref_ce_loss": 0.10201841592788696, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "loss": 0.5710932016372681, + "step": 5410 + }, + { + "ce_loss": 0.1435142308473587, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "distill_loss": 0.07739975303411484, + "epoch": 1.8045363575717146, + "step": 5410 + }, + { + "epoch": 1.8045363575717146, + "ref_ce_loss": 0.1580650359392166, + "step": 5410 + }, + { + "epoch": 1.80787191460974, + "loss": 0.5962, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "grad_norm": 2.5487122535705566, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "learning_rate": 0.0002707881123165334, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "loss": 0.4332432746887207, + "step": 5420 + }, + { + "ce_loss": 0.17312292754650116, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "distill_loss": 0.1059122085571289, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "ref_ce_loss": 0.10186949372291565, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "loss": 0.812820315361023, + "step": 5420 + }, + { + "ce_loss": 0.2344483882188797, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "distill_loss": 0.12185351550579071, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "ref_ce_loss": 0.10016775131225586, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "loss": 0.5691059827804565, + "step": 5420 + }, + { + "ce_loss": 0.19381578266620636, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "distill_loss": 0.12860912084579468, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "ref_ce_loss": 0.10750152915716171, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "loss": 0.4607386589050293, + "step": 5420 + }, + { + "ce_loss": 0.20922338962554932, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "distill_loss": 0.12475057691335678, + "epoch": 1.80787191460974, + "step": 5420 + }, + { + "epoch": 1.80787191460974, + "ref_ce_loss": 0.1266154795885086, + "step": 5420 + }, + { + "epoch": 1.8112074716477653, + "loss": 0.6349, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "grad_norm": 1.9588823318481445, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "learning_rate": 0.00027066789749343324, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "loss": 0.6367460489273071, + "step": 5430 + }, + { + "ce_loss": 0.1723642200231552, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "distill_loss": 0.1507956087589264, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "ref_ce_loss": 0.14595159888267517, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "loss": 0.5201939344406128, + "step": 5430 + }, + { + "ce_loss": 0.13760589063167572, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "distill_loss": 0.10090071707963943, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "ref_ce_loss": 0.15078330039978027, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "loss": 0.6601283550262451, + "step": 5430 + }, + { + "ce_loss": 0.2796492874622345, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "distill_loss": 0.14720816910266876, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "ref_ce_loss": 0.17631027102470398, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "loss": 0.43748265504837036, + "step": 5430 + }, + { + "ce_loss": 0.1258515566587448, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "distill_loss": 0.11924121528863907, + "epoch": 1.8112074716477653, + "step": 5430 + }, + { + "epoch": 1.8112074716477653, + "ref_ce_loss": 0.1339789777994156, + "step": 5430 + }, + { + "epoch": 1.8145430286857906, + "loss": 0.6099, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "grad_norm": 2.69777250289917, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "learning_rate": 0.0002705474626196876, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "loss": 0.5835865139961243, + "step": 5440 + }, + { + "ce_loss": 0.23276305198669434, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "distill_loss": 0.12228725850582123, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "ref_ce_loss": 0.17023053765296936, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "loss": 0.8016778230667114, + "step": 5440 + }, + { + "ce_loss": 0.3399412930011749, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "distill_loss": 0.13676761090755463, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "ref_ce_loss": 0.18872161209583282, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "loss": 0.46767154335975647, + "step": 5440 + }, + { + "ce_loss": 0.1788882166147232, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "distill_loss": 0.13512161374092102, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "ref_ce_loss": 0.09428312629461288, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "loss": 0.9075959920883179, + "step": 5440 + }, + { + "ce_loss": 0.2922513484954834, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "distill_loss": 0.17621180415153503, + "epoch": 1.8145430286857906, + "step": 5440 + }, + { + "epoch": 1.8145430286857906, + "ref_ce_loss": 0.18406786024570465, + "step": 5440 + }, + { + "epoch": 1.817878585723816, + "loss": 0.5886, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "grad_norm": 7.365907192230225, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "learning_rate": 0.0002704268079149223, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "loss": 0.7035877108573914, + "step": 5450 + }, + { + "ce_loss": 0.13215026259422302, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "distill_loss": 0.4339187741279602, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "ref_ce_loss": 0.10032802820205688, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "loss": 0.7162448167800903, + "step": 5450 + }, + { + "ce_loss": 0.18805432319641113, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "distill_loss": 0.37199363112449646, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "ref_ce_loss": 0.12469123303890228, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "loss": 0.5766035914421082, + "step": 5450 + }, + { + "ce_loss": 0.14342515170574188, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "distill_loss": 0.27214252948760986, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "ref_ce_loss": 0.16064272820949554, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "loss": 0.8636009693145752, + "step": 5450 + }, + { + "ce_loss": 0.16702553629875183, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "distill_loss": 0.35656481981277466, + "epoch": 1.817878585723816, + "step": 5450 + }, + { + "epoch": 1.817878585723816, + "ref_ce_loss": 0.1793079525232315, + "step": 5450 + }, + { + "epoch": 1.8212141427618413, + "loss": 0.7028, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "grad_norm": 3.6993865966796875, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "learning_rate": 0.00027030593359916383, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "loss": 0.41134822368621826, + "step": 5460 + }, + { + "ce_loss": 0.1384688913822174, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "distill_loss": 0.1681678742170334, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "ref_ce_loss": 0.10444191843271255, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "loss": 0.6750077605247498, + "step": 5460 + }, + { + "ce_loss": 0.15935517847537994, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "distill_loss": 0.27100151777267456, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "ref_ce_loss": 0.16393183171749115, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "loss": 0.6315606832504272, + "step": 5460 + }, + { + "ce_loss": 0.14299316704273224, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "distill_loss": 0.17834961414337158, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "ref_ce_loss": 0.08231250941753387, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "loss": 0.6965258121490479, + "step": 5460 + }, + { + "ce_loss": 0.31687676906585693, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "distill_loss": 0.20971833169460297, + "epoch": 1.8212141427618413, + "step": 5460 + }, + { + "epoch": 1.8212141427618413, + "ref_ce_loss": 0.16972923278808594, + "step": 5460 + }, + { + "epoch": 1.8245496997998667, + "loss": 0.6912, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "grad_norm": 4.046206474304199, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "learning_rate": 0.0002701848398928393, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "loss": 0.6588386297225952, + "step": 5470 + }, + { + "ce_loss": 0.24764564633369446, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "distill_loss": 0.16088978946208954, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "ref_ce_loss": 0.19515611231327057, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "loss": 0.5033345222473145, + "step": 5470 + }, + { + "ce_loss": 0.22179868817329407, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "distill_loss": 0.16051587462425232, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "ref_ce_loss": 0.08434831351041794, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "loss": 0.6149681210517883, + "step": 5470 + }, + { + "ce_loss": 0.25773754715919495, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "distill_loss": 0.1539229452610016, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "ref_ce_loss": 0.1424413025379181, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "loss": 0.44749850034713745, + "step": 5470 + }, + { + "ce_loss": 0.1712462604045868, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "distill_loss": 0.14961741864681244, + "epoch": 1.8245496997998667, + "step": 5470 + }, + { + "epoch": 1.8245496997998667, + "ref_ce_loss": 0.08178859949111938, + "step": 5470 + }, + { + "epoch": 1.827885256837892, + "loss": 0.5976, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "grad_norm": 2.5611064434051514, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "learning_rate": 0.00027006352701677583, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "loss": 1.1137487888336182, + "step": 5480 + }, + { + "ce_loss": 0.10859647393226624, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "distill_loss": 0.11339569091796875, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "ref_ce_loss": 0.0940646305680275, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "loss": 0.6393406987190247, + "step": 5480 + }, + { + "ce_loss": 0.20108860731124878, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "distill_loss": 0.13915398716926575, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "ref_ce_loss": 0.12006493657827377, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "loss": 0.6720687747001648, + "step": 5480 + }, + { + "ce_loss": 0.23451204597949982, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "distill_loss": 0.19288182258605957, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "ref_ce_loss": 0.17443564534187317, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "loss": 0.5880801677703857, + "step": 5480 + }, + { + "ce_loss": 0.20224609971046448, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "distill_loss": 0.17057499289512634, + "epoch": 1.827885256837892, + "step": 5480 + }, + { + "epoch": 1.827885256837892, + "ref_ce_loss": 0.13367359340190887, + "step": 5480 + }, + { + "epoch": 1.8312208138759174, + "loss": 0.6369, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "grad_norm": 3.403841495513916, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "learning_rate": 0.0002699419951922003, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "loss": 0.5935007333755493, + "step": 5490 + }, + { + "ce_loss": 0.13772651553153992, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "distill_loss": 0.10074299573898315, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "ref_ce_loss": 0.1018882468342781, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "loss": 0.6726405024528503, + "step": 5490 + }, + { + "ce_loss": 0.19621394574642181, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "distill_loss": 0.11780223250389099, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "ref_ce_loss": 0.15497581660747528, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "loss": 1.072763442993164, + "step": 5490 + }, + { + "ce_loss": 0.23977363109588623, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "distill_loss": 0.13159865140914917, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "ref_ce_loss": 0.14965175092220306, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "loss": 0.3933233320713043, + "step": 5490 + }, + { + "ce_loss": 0.16734004020690918, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "distill_loss": 0.11768080294132233, + "epoch": 1.8312208138759174, + "step": 5490 + }, + { + "epoch": 1.8312208138759174, + "ref_ce_loss": 0.10801305621862411, + "step": 5490 + }, + { + "epoch": 1.8345563709139427, + "loss": 0.552, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "grad_norm": 3.3497848510742188, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "learning_rate": 0.0002698202446407388, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "loss": 0.4976787269115448, + "step": 5500 + }, + { + "ce_loss": 0.1326054334640503, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "distill_loss": 0.08439873158931732, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "ref_ce_loss": 0.13565798103809357, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "loss": 0.6742888689041138, + "step": 5500 + }, + { + "ce_loss": 0.2550393342971802, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "distill_loss": 0.12310586869716644, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "ref_ce_loss": 0.2118934988975525, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "loss": 1.2174590826034546, + "step": 5500 + }, + { + "ce_loss": 0.22371555864810944, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "distill_loss": 0.1468636691570282, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "ref_ce_loss": 0.09248271584510803, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "loss": 0.3218141198158264, + "step": 5500 + }, + { + "ce_loss": 0.11735627055168152, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "distill_loss": 0.09057797491550446, + "epoch": 1.8345563709139427, + "step": 5500 + }, + { + "epoch": 1.8345563709139427, + "ref_ce_loss": 0.11361885070800781, + "step": 5500 + }, + { + "epoch": 1.837891927951968, + "loss": 0.5876, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "grad_norm": 3.7035536766052246, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "learning_rate": 0.0002696982755844163, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "loss": 0.48578375577926636, + "step": 5510 + }, + { + "ce_loss": 0.20499315857887268, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "distill_loss": 0.13509148359298706, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "ref_ce_loss": 0.14547577500343323, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "loss": 0.5209768414497375, + "step": 5510 + }, + { + "ce_loss": 0.12706170976161957, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "distill_loss": 0.11170224845409393, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "ref_ce_loss": 0.11799715459346771, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "loss": 0.4726123809814453, + "step": 5510 + }, + { + "ce_loss": 0.13671162724494934, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "distill_loss": 0.1430138349533081, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "ref_ce_loss": 0.14614635705947876, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "loss": 0.4414920210838318, + "step": 5510 + }, + { + "ce_loss": 0.14068323373794556, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "distill_loss": 0.1583692580461502, + "epoch": 1.837891927951968, + "step": 5510 + }, + { + "epoch": 1.837891927951968, + "ref_ce_loss": 0.10938449949026108, + "step": 5510 + }, + { + "epoch": 1.8412274849899934, + "loss": 0.5865, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "grad_norm": 10.9885835647583, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "learning_rate": 0.0002695760882456563, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "loss": 0.49264997243881226, + "step": 5520 + }, + { + "ce_loss": 0.15945468842983246, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "distill_loss": 0.11915487051010132, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "ref_ce_loss": 0.10155269503593445, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "loss": 0.4440663754940033, + "step": 5520 + }, + { + "ce_loss": 0.1484590470790863, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "distill_loss": 0.11816152930259705, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "ref_ce_loss": 0.1403661072254181, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "loss": 0.5174403190612793, + "step": 5520 + }, + { + "ce_loss": 0.12795519828796387, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "distill_loss": 0.12109988927841187, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "ref_ce_loss": 0.09429611265659332, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "loss": 0.6381317377090454, + "step": 5520 + }, + { + "ce_loss": 0.17537306249141693, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "distill_loss": 0.12010416388511658, + "epoch": 1.8412274849899934, + "step": 5520 + }, + { + "epoch": 1.8412274849899934, + "ref_ce_loss": 0.1554737389087677, + "step": 5520 + }, + { + "epoch": 1.8445630420280188, + "loss": 0.588, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "grad_norm": 2.1283841133117676, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "learning_rate": 0.00026945368284728014, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "loss": 0.4387381374835968, + "step": 5530 + }, + { + "ce_loss": 0.20292159914970398, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "distill_loss": 0.10617952048778534, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "ref_ce_loss": 0.12931837141513824, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "loss": 0.7084087133407593, + "step": 5530 + }, + { + "ce_loss": 0.2622057795524597, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "distill_loss": 0.11426561325788498, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "ref_ce_loss": 0.1607864797115326, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "loss": 0.523222804069519, + "step": 5530 + }, + { + "ce_loss": 0.24114282429218292, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "distill_loss": 0.09182117134332657, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "ref_ce_loss": 0.12573231756687164, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "loss": 0.5910396575927734, + "step": 5530 + }, + { + "ce_loss": 0.16739298403263092, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "distill_loss": 0.09659696370363235, + "epoch": 1.8445630420280188, + "step": 5530 + }, + { + "epoch": 1.8445630420280188, + "ref_ce_loss": 0.1586391180753708, + "step": 5530 + }, + { + "epoch": 1.8478985990660441, + "loss": 0.5762, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "grad_norm": 3.0936765670776367, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "learning_rate": 0.0002693310596125072, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "loss": 0.5124553442001343, + "step": 5540 + }, + { + "ce_loss": 0.14690622687339783, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "distill_loss": 0.07775744795799255, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "ref_ce_loss": 0.11736592650413513, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "loss": 0.5300059914588928, + "step": 5540 + }, + { + "ce_loss": 0.20057353377342224, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "distill_loss": 0.09820909053087234, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "ref_ce_loss": 0.18187911808490753, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "loss": 0.4994697570800781, + "step": 5540 + }, + { + "ce_loss": 0.22162176668643951, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "distill_loss": 0.1110476404428482, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "ref_ce_loss": 0.16640129685401917, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "loss": 0.4754143953323364, + "step": 5540 + }, + { + "ce_loss": 0.22003041207790375, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "distill_loss": 0.09422904998064041, + "epoch": 1.8478985990660441, + "step": 5540 + }, + { + "epoch": 1.8478985990660441, + "ref_ce_loss": 0.1313227266073227, + "step": 5540 + }, + { + "epoch": 1.8512341561040695, + "loss": 0.5599, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "grad_norm": 2.980518102645874, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "learning_rate": 0.00026920821876495374, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "loss": 0.8818994760513306, + "step": 5550 + }, + { + "ce_loss": 0.2660450339317322, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "distill_loss": 0.12561841309070587, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "ref_ce_loss": 0.23656994104385376, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "loss": 0.40436068177223206, + "step": 5550 + }, + { + "ce_loss": 0.16543236374855042, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "distill_loss": 0.1111280545592308, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "ref_ce_loss": 0.12749433517456055, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "loss": 0.610494077205658, + "step": 5550 + }, + { + "ce_loss": 0.18998795747756958, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "distill_loss": 0.12114927917718887, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "ref_ce_loss": 0.13568200170993805, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "loss": 0.3950880169868469, + "step": 5550 + }, + { + "ce_loss": 0.1385015845298767, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "distill_loss": 0.10792937129735947, + "epoch": 1.8512341561040695, + "step": 5550 + }, + { + "epoch": 1.8512341561040695, + "ref_ce_loss": 0.09195967018604279, + "step": 5550 + }, + { + "epoch": 1.8545697131420948, + "loss": 0.6336, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "grad_norm": 3.8770010471343994, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "learning_rate": 0.00026908516052863305, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "loss": 0.5988621115684509, + "step": 5560 + }, + { + "ce_loss": 0.275103896856308, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "distill_loss": 0.13239721953868866, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "ref_ce_loss": 0.1453436315059662, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "loss": 0.5324811339378357, + "step": 5560 + }, + { + "ce_loss": 0.20064669847488403, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "distill_loss": 0.16439953446388245, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "ref_ce_loss": 0.12346375733613968, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "loss": 0.9219769239425659, + "step": 5560 + }, + { + "ce_loss": 0.25965479016304016, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "distill_loss": 0.15143707394599915, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "ref_ce_loss": 0.19766446948051453, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "loss": 0.7507549524307251, + "step": 5560 + }, + { + "ce_loss": 0.23712469637393951, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "distill_loss": 0.1717062145471573, + "epoch": 1.8545697131420948, + "step": 5560 + }, + { + "epoch": 1.8545697131420948, + "ref_ce_loss": 0.18241867423057556, + "step": 5560 + }, + { + "epoch": 1.8579052701801202, + "loss": 0.6086, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "grad_norm": 2.5605103969573975, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "learning_rate": 0.0002689618851279549, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "loss": 0.6851871013641357, + "step": 5570 + }, + { + "ce_loss": 0.2842769920825958, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "distill_loss": 0.15424621105194092, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "ref_ce_loss": 0.18643635511398315, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "loss": 0.386451780796051, + "step": 5570 + }, + { + "ce_loss": 0.15897487103939056, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "distill_loss": 0.11713258177042007, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "ref_ce_loss": 0.10942494124174118, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "loss": 0.7646214962005615, + "step": 5570 + }, + { + "ce_loss": 0.18946322798728943, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "distill_loss": 0.15247109532356262, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "ref_ce_loss": 0.12387377768754959, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "loss": 0.3443436920642853, + "step": 5570 + }, + { + "ce_loss": 0.05924952030181885, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "distill_loss": 0.10812821984291077, + "epoch": 1.8579052701801202, + "step": 5570 + }, + { + "epoch": 1.8579052701801202, + "ref_ce_loss": 0.07667309790849686, + "step": 5570 + }, + { + "epoch": 1.8612408272181455, + "loss": 0.5919, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "grad_norm": 4.265673637390137, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "learning_rate": 0.0002688383927877248, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "loss": 0.4172850251197815, + "step": 5580 + }, + { + "ce_loss": 0.1882062405347824, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "distill_loss": 0.09461406618356705, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "ref_ce_loss": 0.09314027428627014, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "loss": 0.6765507459640503, + "step": 5580 + }, + { + "ce_loss": 0.1747109591960907, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "distill_loss": 0.06782689690589905, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "ref_ce_loss": 0.12354877591133118, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "loss": 0.6376481056213379, + "step": 5580 + }, + { + "ce_loss": 0.16509592533111572, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "distill_loss": 0.10020028054714203, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "ref_ce_loss": 0.09290022403001785, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "loss": 0.540982186794281, + "step": 5580 + }, + { + "ce_loss": 0.19303928315639496, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "distill_loss": 0.11706292629241943, + "epoch": 1.8612408272181455, + "step": 5580 + }, + { + "epoch": 1.8612408272181455, + "ref_ce_loss": 0.15270400047302246, + "step": 5580 + }, + { + "epoch": 1.864576384256171, + "loss": 0.5862, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "grad_norm": 2.270423650741577, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "learning_rate": 0.00026871468373314424, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "loss": 0.55879807472229, + "step": 5590 + }, + { + "ce_loss": 0.2468176931142807, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "distill_loss": 0.09950075298547745, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "ref_ce_loss": 0.15365587174892426, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "loss": 0.6020499467849731, + "step": 5590 + }, + { + "ce_loss": 0.18916167318820953, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "distill_loss": 0.11123539507389069, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "ref_ce_loss": 0.214087575674057, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "loss": 0.740506112575531, + "step": 5590 + }, + { + "ce_loss": 0.053918082267045975, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "distill_loss": 0.07949318736791611, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "ref_ce_loss": 0.08161577582359314, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "loss": 0.5570569038391113, + "step": 5590 + }, + { + "ce_loss": 0.2485281080007553, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "distill_loss": 0.13526736199855804, + "epoch": 1.864576384256171, + "step": 5590 + }, + { + "epoch": 1.864576384256171, + "ref_ce_loss": 0.1323298215866089, + "step": 5590 + }, + { + "epoch": 1.8679119412941962, + "loss": 0.612, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "grad_norm": 2.4391157627105713, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "learning_rate": 0.0002685907581898097, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "loss": 0.40929847955703735, + "step": 5600 + }, + { + "ce_loss": 0.145944282412529, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "distill_loss": 0.1584329605102539, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "ref_ce_loss": 0.10474085062742233, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "loss": 0.5263919830322266, + "step": 5600 + }, + { + "ce_loss": 0.1532854288816452, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "distill_loss": 0.13151933252811432, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "ref_ce_loss": 0.14014704525470734, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "loss": 0.509942889213562, + "step": 5600 + }, + { + "ce_loss": 0.1427435576915741, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "distill_loss": 0.13369926810264587, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "ref_ce_loss": 0.1754363775253296, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "loss": 0.4154976010322571, + "step": 5600 + }, + { + "ce_loss": 0.16734033823013306, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "distill_loss": 0.11521608382463455, + "epoch": 1.8679119412941962, + "step": 5600 + }, + { + "epoch": 1.8679119412941962, + "ref_ce_loss": 0.09764881432056427, + "step": 5600 + }, + { + "epoch": 1.8712474983322216, + "loss": 0.6108, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "grad_norm": 2.0599684715270996, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "learning_rate": 0.0002684666163837124, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "loss": 0.45766517519950867, + "step": 5610 + }, + { + "ce_loss": 0.1805308759212494, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "distill_loss": 0.11159662157297134, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "ref_ce_loss": 0.1650044173002243, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "loss": 0.4901134967803955, + "step": 5610 + }, + { + "ce_loss": 0.20846788585186005, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "distill_loss": 0.1440722793340683, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "ref_ce_loss": 0.13724854588508606, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "loss": 0.5946434140205383, + "step": 5610 + }, + { + "ce_loss": 0.2532452642917633, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "distill_loss": 0.16743330657482147, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "ref_ce_loss": 0.10788458585739136, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "loss": 0.8087218403816223, + "step": 5610 + }, + { + "ce_loss": 0.22373901307582855, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "distill_loss": 0.11417116969823837, + "epoch": 1.8712474983322216, + "step": 5610 + }, + { + "epoch": 1.8712474983322216, + "ref_ce_loss": 0.1280953735113144, + "step": 5610 + }, + { + "epoch": 1.874583055370247, + "loss": 0.585, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "grad_norm": 2.23083758354187, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "learning_rate": 0.0002683422585412381, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "loss": 0.41577935218811035, + "step": 5620 + }, + { + "ce_loss": 0.15030327439308167, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "distill_loss": 0.11963702738285065, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "ref_ce_loss": 0.09877686202526093, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "loss": 0.4805477559566498, + "step": 5620 + }, + { + "ce_loss": 0.19180341064929962, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "distill_loss": 0.10885943472385406, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "ref_ce_loss": 0.11150091886520386, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "loss": 0.9445975422859192, + "step": 5620 + }, + { + "ce_loss": 0.1629776507616043, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "distill_loss": 0.12794573605060577, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "ref_ce_loss": 0.13277025520801544, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "loss": 0.9632389545440674, + "step": 5620 + }, + { + "ce_loss": 0.30103176832199097, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "distill_loss": 0.15930506587028503, + "epoch": 1.874583055370247, + "step": 5620 + }, + { + "epoch": 1.874583055370247, + "ref_ce_loss": 0.15233488380908966, + "step": 5620 + }, + { + "epoch": 1.8779186124082723, + "loss": 0.6478, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "grad_norm": 2.3699145317077637, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "learning_rate": 0.00026821768488916644, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "loss": 0.551582932472229, + "step": 5630 + }, + { + "ce_loss": 0.09384066611528397, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "distill_loss": 0.06880702078342438, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "ref_ce_loss": 0.09983465075492859, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "loss": 0.856539249420166, + "step": 5630 + }, + { + "ce_loss": 0.2709778845310211, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "distill_loss": 0.15140986442565918, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "ref_ce_loss": 0.15218645334243774, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "loss": 0.26097142696380615, + "step": 5630 + }, + { + "ce_loss": 0.06970866024494171, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "distill_loss": 0.09343140572309494, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "ref_ce_loss": 0.0976763591170311, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "loss": 0.6135655045509338, + "step": 5630 + }, + { + "ce_loss": 0.33973413705825806, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "distill_loss": 0.13375625014305115, + "epoch": 1.8779186124082723, + "step": 5630 + }, + { + "epoch": 1.8779186124082723, + "ref_ce_loss": 0.13988807797431946, + "step": 5630 + }, + { + "epoch": 1.8812541694462976, + "loss": 0.5563, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "grad_norm": 2.838254451751709, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "learning_rate": 0.0002680928956546706, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "loss": 0.5370069146156311, + "step": 5640 + }, + { + "ce_loss": 0.2182268649339676, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "distill_loss": 0.13402530550956726, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "ref_ce_loss": 0.11913847923278809, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "loss": 0.6514466404914856, + "step": 5640 + }, + { + "ce_loss": 0.2067224681377411, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "distill_loss": 0.1199726015329361, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "ref_ce_loss": 0.1521792709827423, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "loss": 0.4596925377845764, + "step": 5640 + }, + { + "ce_loss": 0.16043129563331604, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "distill_loss": 0.10075315088033676, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "ref_ce_loss": 0.15545539557933807, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "loss": 0.6942105889320374, + "step": 5640 + }, + { + "ce_loss": 0.2646586298942566, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "distill_loss": 0.10686776787042618, + "epoch": 1.8812541694462976, + "step": 5640 + }, + { + "epoch": 1.8812541694462976, + "ref_ce_loss": 0.14801819622516632, + "step": 5640 + }, + { + "epoch": 1.884589726484323, + "loss": 0.5193, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "grad_norm": 2.476203203201294, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "learning_rate": 0.00026796789106531694, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "loss": 0.5218818783760071, + "step": 5650 + }, + { + "ce_loss": 0.2017936408519745, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "distill_loss": 0.09580246359109879, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "ref_ce_loss": 0.15740014612674713, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "loss": 0.7491849064826965, + "step": 5650 + }, + { + "ce_loss": 0.3006199896335602, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "distill_loss": 0.14339643716812134, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "ref_ce_loss": 0.14143864810466766, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "loss": 0.5106698274612427, + "step": 5650 + }, + { + "ce_loss": 0.22800149023532867, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "distill_loss": 0.10951074957847595, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "ref_ce_loss": 0.17092949151992798, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "loss": 0.35551875829696655, + "step": 5650 + }, + { + "ce_loss": 0.09727875888347626, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "distill_loss": 0.0897751897573471, + "epoch": 1.884589726484323, + "step": 5650 + }, + { + "epoch": 1.884589726484323, + "ref_ce_loss": 0.11061455309391022, + "step": 5650 + }, + { + "epoch": 1.8879252835223483, + "loss": 0.5365, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "grad_norm": 2.804887533187866, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "learning_rate": 0.0002678426713490645, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "loss": 0.7548896670341492, + "step": 5660 + }, + { + "ce_loss": 0.2367536723613739, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "distill_loss": 0.08415091782808304, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "ref_ce_loss": 0.13766418397426605, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "loss": 0.35744622349739075, + "step": 5660 + }, + { + "ce_loss": 0.15886476635932922, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "distill_loss": 0.06705141812562943, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "ref_ce_loss": 0.13137438893318176, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "loss": 0.49181342124938965, + "step": 5660 + }, + { + "ce_loss": 0.18229323625564575, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "distill_loss": 0.06833770871162415, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "ref_ce_loss": 0.12497011572122574, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "loss": 0.45327556133270264, + "step": 5660 + }, + { + "ce_loss": 0.14888149499893188, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "distill_loss": 0.07899749279022217, + "epoch": 1.8879252835223483, + "step": 5660 + }, + { + "epoch": 1.8879252835223483, + "ref_ce_loss": 0.12956686317920685, + "step": 5660 + }, + { + "epoch": 1.8912608405603737, + "loss": 0.6139, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "grad_norm": 3.510312557220459, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "learning_rate": 0.0002677172367342646, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "loss": 0.6080343127250671, + "step": 5670 + }, + { + "ce_loss": 0.18910494446754456, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "distill_loss": 0.10964290797710419, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "ref_ce_loss": 0.19770650565624237, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "loss": 0.3534941077232361, + "step": 5670 + }, + { + "ce_loss": 0.1675875186920166, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "distill_loss": 0.10437571257352829, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "ref_ce_loss": 0.08139434456825256, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "loss": 0.34793180227279663, + "step": 5670 + }, + { + "ce_loss": 0.11697270721197128, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "distill_loss": 0.09839901328086853, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "ref_ce_loss": 0.13236558437347412, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "loss": 0.9213211536407471, + "step": 5670 + }, + { + "ce_loss": 0.31074386835098267, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "distill_loss": 0.19469662010669708, + "epoch": 1.8912608405603737, + "step": 5670 + }, + { + "epoch": 1.8912608405603737, + "ref_ce_loss": 0.14483579993247986, + "step": 5670 + }, + { + "epoch": 1.894596397598399, + "loss": 0.6004, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "grad_norm": 2.557516098022461, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "learning_rate": 0.00026759158744966066, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "loss": 0.41085487604141235, + "step": 5680 + }, + { + "ce_loss": 0.14269503951072693, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "distill_loss": 0.12160893529653549, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "ref_ce_loss": 0.10581735521554947, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "loss": 0.49807649850845337, + "step": 5680 + }, + { + "ce_loss": 0.14981544017791748, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "distill_loss": 0.17303360998630524, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "ref_ce_loss": 0.11994165182113647, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "loss": 0.7087249159812927, + "step": 5680 + }, + { + "ce_loss": 0.20051364600658417, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "distill_loss": 0.2588937282562256, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "ref_ce_loss": 0.14541538059711456, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "loss": 0.4271368384361267, + "step": 5680 + }, + { + "ce_loss": 0.16366565227508545, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "distill_loss": 0.1520344465970993, + "epoch": 1.894596397598399, + "step": 5680 + }, + { + "epoch": 1.894596397598399, + "ref_ce_loss": 0.11106879264116287, + "step": 5680 + }, + { + "epoch": 1.8979319546364244, + "loss": 0.6308, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "grad_norm": 2.758974075317383, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "learning_rate": 0.0002674657237243873, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "loss": 0.5080286264419556, + "step": 5690 + }, + { + "ce_loss": 0.13973647356033325, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "distill_loss": 0.21855957806110382, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "ref_ce_loss": 0.10131456702947617, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "loss": 0.7637765407562256, + "step": 5690 + }, + { + "ce_loss": 0.1351289600133896, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "distill_loss": 0.14129924774169922, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "ref_ce_loss": 0.06359421461820602, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "loss": 0.5765724182128906, + "step": 5690 + }, + { + "ce_loss": 0.24543757736682892, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "distill_loss": 0.1919766515493393, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "ref_ce_loss": 0.13904628157615662, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "loss": 0.4055544435977936, + "step": 5690 + }, + { + "ce_loss": 0.07628988474607468, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "distill_loss": 0.1454552859067917, + "epoch": 1.8979319546364244, + "step": 5690 + }, + { + "epoch": 1.8979319546364244, + "ref_ce_loss": 0.13771748542785645, + "step": 5690 + }, + { + "epoch": 1.9012675116744497, + "loss": 0.6209, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "grad_norm": 5.222294807434082, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "learning_rate": 0.0002673396457879703, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "loss": 0.7928350567817688, + "step": 5700 + }, + { + "ce_loss": 0.18566617369651794, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "distill_loss": 0.1641082614660263, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "ref_ce_loss": 0.11963324248790741, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "loss": 0.9268467426300049, + "step": 5700 + }, + { + "ce_loss": 0.2279396653175354, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "distill_loss": 0.17557619512081146, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "ref_ce_loss": 0.18660259246826172, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "loss": 0.3865606188774109, + "step": 5700 + }, + { + "ce_loss": 0.1271536648273468, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "distill_loss": 0.1433785855770111, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "ref_ce_loss": 0.11587008833885193, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "loss": 0.5936480760574341, + "step": 5700 + }, + { + "ce_loss": 0.19039350748062134, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "distill_loss": 0.16909794509410858, + "epoch": 1.9012675116744497, + "step": 5700 + }, + { + "epoch": 1.9012675116744497, + "ref_ce_loss": 0.10260917991399765, + "step": 5700 + }, + { + "epoch": 1.904603068712475, + "loss": 0.6287, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "grad_norm": 3.323878526687622, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "learning_rate": 0.00026721335387032603, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "loss": 0.5773212909698486, + "step": 5710 + }, + { + "ce_loss": 0.20355701446533203, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "distill_loss": 0.13835524022579193, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "ref_ce_loss": 0.13054907321929932, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "loss": 0.6119946241378784, + "step": 5710 + }, + { + "ce_loss": 0.08676505833864212, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "distill_loss": 0.12342211604118347, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "ref_ce_loss": 0.15018554031848907, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "loss": 0.4971517324447632, + "step": 5710 + }, + { + "ce_loss": 0.11750080436468124, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "distill_loss": 0.09639900177717209, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "ref_ce_loss": 0.13823343813419342, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "loss": 0.5045558214187622, + "step": 5710 + }, + { + "ce_loss": 0.10029848664999008, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "distill_loss": 0.11299914866685867, + "epoch": 1.904603068712475, + "step": 5710 + }, + { + "epoch": 1.904603068712475, + "ref_ce_loss": 0.10818342119455338, + "step": 5710 + }, + { + "epoch": 1.9079386257505004, + "loss": 0.5606, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "grad_norm": 4.714890003204346, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "learning_rate": 0.0002670868482017613, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "loss": 0.5226252675056458, + "step": 5720 + }, + { + "ce_loss": 0.2095170021057129, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "distill_loss": 0.11923301219940186, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "ref_ce_loss": 0.16285623610019684, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "loss": 0.2856457829475403, + "step": 5720 + }, + { + "ce_loss": 0.09019511193037033, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "distill_loss": 0.10442131757736206, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "ref_ce_loss": 0.09080447256565094, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "loss": 1.5851621627807617, + "step": 5720 + }, + { + "ce_loss": 0.34080591797828674, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "distill_loss": 0.1635332554578781, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "ref_ce_loss": 0.24340665340423584, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "loss": 0.760962963104248, + "step": 5720 + }, + { + "ce_loss": 0.33830568194389343, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "distill_loss": 0.18809860944747925, + "epoch": 1.9079386257505004, + "step": 5720 + }, + { + "epoch": 1.9079386257505004, + "ref_ce_loss": 0.19881245493888855, + "step": 5720 + }, + { + "epoch": 1.9112741827885258, + "loss": 0.691, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "grad_norm": 1.771593451499939, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "learning_rate": 0.0002669601290129724, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "loss": 1.1627140045166016, + "step": 5730 + }, + { + "ce_loss": 0.25697022676467896, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "distill_loss": 0.12009838223457336, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "ref_ce_loss": 0.1639402210712433, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "loss": 0.9581549167633057, + "step": 5730 + }, + { + "ce_loss": 0.19211870431900024, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "distill_loss": 0.14240829646587372, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "ref_ce_loss": 0.13938498497009277, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "loss": 0.6060691475868225, + "step": 5730 + }, + { + "ce_loss": 0.2377042919397354, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "distill_loss": 0.128531351685524, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "ref_ce_loss": 0.17777924239635468, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "loss": 0.5381349921226501, + "step": 5730 + }, + { + "ce_loss": 0.2451322376728058, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "distill_loss": 0.11923262476921082, + "epoch": 1.9112741827885258, + "step": 5730 + }, + { + "epoch": 1.9112741827885258, + "ref_ce_loss": 0.13032066822052002, + "step": 5730 + }, + { + "epoch": 1.9146097398265511, + "loss": 0.6032, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "grad_norm": 3.885390281677246, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "learning_rate": 0.00026683319653504514, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "loss": 0.662714958190918, + "step": 5740 + }, + { + "ce_loss": 0.1848512589931488, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "distill_loss": 0.11129119992256165, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "ref_ce_loss": 0.13736315071582794, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "loss": 0.7138192653656006, + "step": 5740 + }, + { + "ce_loss": 0.14085586369037628, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "distill_loss": 0.09094604104757309, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "ref_ce_loss": 0.11834074556827545, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "loss": 0.6987002491950989, + "step": 5740 + }, + { + "ce_loss": 0.1906113177537918, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "distill_loss": 0.11060132831335068, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "ref_ce_loss": 0.16047611832618713, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "loss": 0.8356293439865112, + "step": 5740 + }, + { + "ce_loss": 0.20562797784805298, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "distill_loss": 0.09776540845632553, + "epoch": 1.9146097398265511, + "step": 5740 + }, + { + "epoch": 1.9146097398265511, + "ref_ce_loss": 0.1992952972650528, + "step": 5740 + }, + { + "epoch": 1.9179452968645765, + "loss": 0.556, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "grad_norm": 2.8669350147247314, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "learning_rate": 0.0002667060509994544, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "loss": 0.4409118890762329, + "step": 5750 + }, + { + "ce_loss": 0.14581716060638428, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "distill_loss": 0.10838485509157181, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "ref_ce_loss": 0.11615348607301712, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "loss": 0.6378897428512573, + "step": 5750 + }, + { + "ce_loss": 0.20877128839492798, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "distill_loss": 0.1472756564617157, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "ref_ce_loss": 0.12192574888467789, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "loss": 0.5242109298706055, + "step": 5750 + }, + { + "ce_loss": 0.20813210308551788, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "distill_loss": 0.1383836269378662, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "ref_ce_loss": 0.11328185349702835, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "loss": 0.34768980741500854, + "step": 5750 + }, + { + "ce_loss": 0.09965898841619492, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "distill_loss": 0.0946233868598938, + "epoch": 1.9179452968645765, + "step": 5750 + }, + { + "epoch": 1.9179452968645765, + "ref_ce_loss": 0.10129421949386597, + "step": 5750 + }, + { + "epoch": 1.9212808539026018, + "loss": 0.5762, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "grad_norm": 1.8706265687942505, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "learning_rate": 0.0002665786926380634, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "loss": 0.3974435329437256, + "step": 5760 + }, + { + "ce_loss": 0.11973472684621811, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "distill_loss": 0.10423251986503601, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "ref_ce_loss": 0.11439381539821625, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "loss": 0.6238061785697937, + "step": 5760 + }, + { + "ce_loss": 0.2051391750574112, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "distill_loss": 0.13982662558555603, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "ref_ce_loss": 0.14438097178936005, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "loss": 0.7971160411834717, + "step": 5760 + }, + { + "ce_loss": 0.280038446187973, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "distill_loss": 0.11909227073192596, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "ref_ce_loss": 0.1876353621482849, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "loss": 0.7649089694023132, + "step": 5760 + }, + { + "ce_loss": 0.3388735055923462, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "distill_loss": 0.16161924600601196, + "epoch": 1.9212808539026018, + "step": 5760 + }, + { + "epoch": 1.9212808539026018, + "ref_ce_loss": 0.20831725001335144, + "step": 5760 + }, + { + "epoch": 1.9246164109406272, + "loss": 0.6259, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "grad_norm": 2.445451498031616, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "learning_rate": 0.0002664511216831235, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "loss": 0.5543986558914185, + "step": 5770 + }, + { + "ce_loss": 0.25721198320388794, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "distill_loss": 0.14469127357006073, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "ref_ce_loss": 0.11064379662275314, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "loss": 0.42987683415412903, + "step": 5770 + }, + { + "ce_loss": 0.19463425874710083, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "distill_loss": 0.10757134109735489, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "ref_ce_loss": 0.10230542719364166, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "loss": 0.44369128346443176, + "step": 5770 + }, + { + "ce_loss": 0.16867774724960327, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "distill_loss": 0.11673156917095184, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "ref_ce_loss": 0.11092256754636765, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "loss": 0.9410002827644348, + "step": 5770 + }, + { + "ce_loss": 0.24478532373905182, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "distill_loss": 0.14264214038848877, + "epoch": 1.9246164109406272, + "step": 5770 + }, + { + "epoch": 1.9246164109406272, + "ref_ce_loss": 0.1397172510623932, + "step": 5770 + }, + { + "epoch": 1.9279519679786525, + "loss": 0.5818, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "grad_norm": 2.4682679176330566, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "learning_rate": 0.000266323338367274, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "loss": 0.45416784286499023, + "step": 5780 + }, + { + "ce_loss": 0.11269327998161316, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "distill_loss": 0.11082468926906586, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "ref_ce_loss": 0.09867662936449051, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "loss": 0.4797813296318054, + "step": 5780 + }, + { + "ce_loss": 0.13305124640464783, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "distill_loss": 0.14016100764274597, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "ref_ce_loss": 0.10984981060028076, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "loss": 0.4248768091201782, + "step": 5780 + }, + { + "ce_loss": 0.14768195152282715, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "distill_loss": 0.11138048022985458, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "ref_ce_loss": 0.11871366202831268, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "loss": 0.5333288311958313, + "step": 5780 + }, + { + "ce_loss": 0.1860743910074234, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "distill_loss": 0.13695748150348663, + "epoch": 1.9279519679786525, + "step": 5780 + }, + { + "epoch": 1.9279519679786525, + "ref_ce_loss": 0.11091623455286026, + "step": 5780 + }, + { + "epoch": 1.9312875250166779, + "loss": 0.5371, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "grad_norm": 3.2531745433807373, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "learning_rate": 0.000266195342923541, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "loss": 0.36490774154663086, + "step": 5790 + }, + { + "ce_loss": 0.10847225040197372, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "distill_loss": 0.07613347470760345, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "ref_ce_loss": 0.1256321221590042, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "loss": 0.3799107074737549, + "step": 5790 + }, + { + "ce_loss": 0.17202770709991455, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "distill_loss": 0.12261956185102463, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "ref_ce_loss": 0.08407846093177795, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "loss": 0.4411643445491791, + "step": 5790 + }, + { + "ce_loss": 0.08297307789325714, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "distill_loss": 0.07652588933706284, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "ref_ce_loss": 0.12075881659984589, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "loss": 0.5024446845054626, + "step": 5790 + }, + { + "ce_loss": 0.16614662110805511, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "distill_loss": 0.10244978964328766, + "epoch": 1.9312875250166779, + "step": 5790 + }, + { + "epoch": 1.9312875250166779, + "ref_ce_loss": 0.13249625265598297, + "step": 5790 + }, + { + "epoch": 1.9346230820547032, + "loss": 0.6017, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "grad_norm": 2.583327054977417, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "learning_rate": 0.0002660671355853379, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "loss": 0.8501856327056885, + "step": 5800 + }, + { + "ce_loss": 0.2730255424976349, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "distill_loss": 0.13773846626281738, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "ref_ce_loss": 0.11032268404960632, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "loss": 0.3907512426376343, + "step": 5800 + }, + { + "ce_loss": 0.1428876668214798, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "distill_loss": 0.1206461638212204, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "ref_ce_loss": 0.12690801918506622, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "loss": 1.1583483219146729, + "step": 5800 + }, + { + "ce_loss": 0.3016435205936432, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "distill_loss": 0.1703663468360901, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "ref_ce_loss": 0.12071062624454498, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "loss": 0.7512863874435425, + "step": 5800 + }, + { + "ce_loss": 0.3120938241481781, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "distill_loss": 0.1428443342447281, + "epoch": 1.9346230820547032, + "step": 5800 + }, + { + "epoch": 1.9346230820547032, + "ref_ce_loss": 0.2358296513557434, + "step": 5800 + }, + { + "epoch": 1.9379586390927286, + "loss": 0.6324, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "grad_norm": 4.115167617797852, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "learning_rate": 0.0002659387165864642, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "loss": 0.48370733857154846, + "step": 5810 + }, + { + "ce_loss": 0.2041088342666626, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "distill_loss": 0.11525950580835342, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "ref_ce_loss": 0.10774882882833481, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "loss": 0.39353325963020325, + "step": 5810 + }, + { + "ce_loss": 0.09266683459281921, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "distill_loss": 0.10596026480197906, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "ref_ce_loss": 0.12899544835090637, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "loss": 0.5259385108947754, + "step": 5810 + }, + { + "ce_loss": 0.17804045975208282, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "distill_loss": 0.12347061187028885, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "ref_ce_loss": 0.08819334208965302, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "loss": 0.5755244493484497, + "step": 5810 + }, + { + "ce_loss": 0.22214099764823914, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "distill_loss": 0.0878373458981514, + "epoch": 1.9379586390927286, + "step": 5810 + }, + { + "epoch": 1.9379586390927286, + "ref_ce_loss": 0.13193178176879883, + "step": 5810 + }, + { + "epoch": 1.941294196130754, + "loss": 0.5549, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "grad_norm": 3.8637335300445557, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "learning_rate": 0.0002658100861611056, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "loss": 0.40220752358436584, + "step": 5820 + }, + { + "ce_loss": 0.09550049155950546, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "distill_loss": 0.08620162308216095, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "ref_ce_loss": 0.12067662179470062, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "loss": 0.3592372536659241, + "step": 5820 + }, + { + "ce_loss": 0.1292707771062851, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "distill_loss": 0.09745273739099503, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "ref_ce_loss": 0.09250719100236893, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "loss": 0.6222105622291565, + "step": 5820 + }, + { + "ce_loss": 0.17958803474903107, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "distill_loss": 0.10309149324893951, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "ref_ce_loss": 0.16316074132919312, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "loss": 0.8499335646629333, + "step": 5820 + }, + { + "ce_loss": 0.19285449385643005, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "distill_loss": 0.07820558547973633, + "epoch": 1.941294196130754, + "step": 5820 + }, + { + "epoch": 1.941294196130754, + "ref_ce_loss": 0.1462705433368683, + "step": 5820 + }, + { + "epoch": 1.9446297531687793, + "loss": 0.5729, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "grad_norm": 1.8970744609832764, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "learning_rate": 0.0002656812445438332, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "loss": 0.5597825050354004, + "step": 5830 + }, + { + "ce_loss": 0.232600137591362, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "distill_loss": 0.1474754512310028, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "ref_ce_loss": 0.09885487705469131, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "loss": 0.43907564878463745, + "step": 5830 + }, + { + "ce_loss": 0.18678975105285645, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "distill_loss": 0.10709083825349808, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "ref_ce_loss": 0.10085967928171158, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "loss": 0.43083420395851135, + "step": 5830 + }, + { + "ce_loss": 0.1751118153333664, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "distill_loss": 0.12244075536727905, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "ref_ce_loss": 0.08682743459939957, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "loss": 0.7343491911888123, + "step": 5830 + }, + { + "ce_loss": 0.2072955071926117, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "distill_loss": 0.10877859592437744, + "epoch": 1.9446297531687793, + "step": 5830 + }, + { + "epoch": 1.9446297531687793, + "ref_ce_loss": 0.14639054238796234, + "step": 5830 + }, + { + "epoch": 1.9479653102068046, + "loss": 0.5426, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "grad_norm": 1.6072348356246948, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "learning_rate": 0.0002655521919696032, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "loss": 0.8225134611129761, + "step": 5840 + }, + { + "ce_loss": 0.31686344742774963, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "distill_loss": 0.14142484962940216, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "ref_ce_loss": 0.14611388742923737, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "loss": 0.36344388127326965, + "step": 5840 + }, + { + "ce_loss": 0.1438729166984558, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "distill_loss": 0.12257090955972672, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "ref_ce_loss": 0.09651162475347519, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "loss": 0.5657672882080078, + "step": 5840 + }, + { + "ce_loss": 0.14495164155960083, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "distill_loss": 0.09493519365787506, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "ref_ce_loss": 0.10763535648584366, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "loss": 0.49626821279525757, + "step": 5840 + }, + { + "ce_loss": 0.25028860569000244, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "distill_loss": 0.1078210175037384, + "epoch": 1.9479653102068046, + "step": 5840 + }, + { + "epoch": 1.9479653102068046, + "ref_ce_loss": 0.13762708008289337, + "step": 5840 + }, + { + "epoch": 1.95130086724483, + "loss": 0.6354, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "grad_norm": 3.0269176959991455, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "learning_rate": 0.0002654229286737567, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "loss": 0.512065589427948, + "step": 5850 + }, + { + "ce_loss": 0.1600334197282791, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "distill_loss": 0.09044967591762543, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "ref_ce_loss": 0.11078616976737976, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "loss": 0.7213431596755981, + "step": 5850 + }, + { + "ce_loss": 0.16717001795768738, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "distill_loss": 0.12851962447166443, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "ref_ce_loss": 0.1945911943912506, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "loss": 0.5515893697738647, + "step": 5850 + }, + { + "ce_loss": 0.10928317159414291, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "distill_loss": 0.09205825626850128, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "ref_ce_loss": 0.11985401064157486, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "loss": 0.8128691911697388, + "step": 5850 + }, + { + "ce_loss": 0.15571244060993195, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "distill_loss": 0.12451403588056564, + "epoch": 1.95130086724483, + "step": 5850 + }, + { + "epoch": 1.95130086724483, + "ref_ce_loss": 0.10983094573020935, + "step": 5850 + }, + { + "epoch": 1.9546364242828553, + "loss": 0.7005, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "grad_norm": 3.6902854442596436, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "learning_rate": 0.00026529345489201896, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "loss": 0.6848353147506714, + "step": 5860 + }, + { + "ce_loss": 0.14601679146289825, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "distill_loss": 0.11411919444799423, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "ref_ce_loss": 0.12418550252914429, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "loss": 0.46468979120254517, + "step": 5860 + }, + { + "ce_loss": 0.21989013254642487, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "distill_loss": 0.11546780169010162, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "ref_ce_loss": 0.12887084484100342, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "loss": 0.6564695835113525, + "step": 5860 + }, + { + "ce_loss": 0.3337359130382538, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "distill_loss": 0.13268591463565826, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "ref_ce_loss": 0.1438097208738327, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "loss": 0.9603589177131653, + "step": 5860 + }, + { + "ce_loss": 0.13805945217609406, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "distill_loss": 0.12165968120098114, + "epoch": 1.9546364242828553, + "step": 5860 + }, + { + "epoch": 1.9546364242828553, + "ref_ce_loss": 0.12685857713222504, + "step": 5860 + }, + { + "epoch": 1.9579719813208807, + "loss": 0.5612, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "grad_norm": 2.58427095413208, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "learning_rate": 0.000265163770860499, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "loss": 0.42864927649497986, + "step": 5870 + }, + { + "ce_loss": 0.14903698861598969, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "distill_loss": 0.12617306411266327, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "ref_ce_loss": 0.10953165590763092, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "loss": 0.4737485349178314, + "step": 5870 + }, + { + "ce_loss": 0.19709208607673645, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "distill_loss": 0.11354006826877594, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "ref_ce_loss": 0.1605563461780548, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "loss": 0.7547450065612793, + "step": 5870 + }, + { + "ce_loss": 0.19148224592208862, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "distill_loss": 0.1140814945101738, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "ref_ce_loss": 0.14812114834785461, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "loss": 0.4125843048095703, + "step": 5870 + }, + { + "ce_loss": 0.1644628345966339, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "distill_loss": 0.08505094796419144, + "epoch": 1.9579719813208807, + "step": 5870 + }, + { + "epoch": 1.9579719813208807, + "ref_ce_loss": 0.10612877458333969, + "step": 5870 + }, + { + "epoch": 1.961307538358906, + "loss": 0.5927, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "grad_norm": 2.976412057876587, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "learning_rate": 0.0002650338768156894, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "loss": 0.6274206638336182, + "step": 5880 + }, + { + "ce_loss": 0.250379353761673, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "distill_loss": 0.11398634314537048, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "ref_ce_loss": 0.14656729996204376, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "loss": 0.6905542612075806, + "step": 5880 + }, + { + "ce_loss": 0.22709456086158752, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "distill_loss": 0.11130400002002716, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "ref_ce_loss": 0.07458300143480301, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "loss": 0.6171120405197144, + "step": 5880 + }, + { + "ce_loss": 0.11764512956142426, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "distill_loss": 0.10441337525844574, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "ref_ce_loss": 0.1329973042011261, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "loss": 0.3864017128944397, + "step": 5880 + }, + { + "ce_loss": 0.1463531106710434, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "distill_loss": 0.08627346903085709, + "epoch": 1.961307538358906, + "step": 5880 + }, + { + "epoch": 1.961307538358906, + "ref_ce_loss": 0.09403983503580093, + "step": 5880 + }, + { + "epoch": 1.9646430953969314, + "loss": 0.5942, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "grad_norm": 3.624955177307129, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "learning_rate": 0.0002649037729944657, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "loss": 0.8614783883094788, + "step": 5890 + }, + { + "ce_loss": 0.23180143535137177, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "distill_loss": 0.10813884437084198, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "ref_ce_loss": 0.16049298644065857, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "loss": 0.6531976461410522, + "step": 5890 + }, + { + "ce_loss": 0.22142180800437927, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "distill_loss": 0.11609365046024323, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "ref_ce_loss": 0.2098771631717682, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "loss": 0.7346011996269226, + "step": 5890 + }, + { + "ce_loss": 0.22203612327575684, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "distill_loss": 0.13322067260742188, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "ref_ce_loss": 0.11884419620037079, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "loss": 0.3451249301433563, + "step": 5890 + }, + { + "ce_loss": 0.09568674117326736, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "distill_loss": 0.11562363058328629, + "epoch": 1.9646430953969314, + "step": 5890 + }, + { + "epoch": 1.9646430953969314, + "ref_ce_loss": 0.13373398780822754, + "step": 5890 + }, + { + "epoch": 1.9679786524349567, + "loss": 0.6328, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "grad_norm": 3.982816457748413, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "learning_rate": 0.0002647734596340859, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "loss": 0.5729438066482544, + "step": 5900 + }, + { + "ce_loss": 0.20175018906593323, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "distill_loss": 0.10930713266134262, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "ref_ce_loss": 0.11769817024469376, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "loss": 0.2672497034072876, + "step": 5900 + }, + { + "ce_loss": 0.10306866466999054, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "distill_loss": 0.09915365278720856, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "ref_ce_loss": 0.06501106917858124, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "loss": 0.5962284803390503, + "step": 5900 + }, + { + "ce_loss": 0.2861151099205017, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "distill_loss": 0.12377713620662689, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "ref_ce_loss": 0.18407902121543884, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "loss": 0.5468906164169312, + "step": 5900 + }, + { + "ce_loss": 0.17363518476486206, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "distill_loss": 0.11055031418800354, + "epoch": 1.9679786524349567, + "step": 5900 + }, + { + "epoch": 1.9679786524349567, + "ref_ce_loss": 0.10016216337680817, + "step": 5900 + }, + { + "epoch": 1.971314209472982, + "loss": 0.5764, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "grad_norm": 3.0539867877960205, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "learning_rate": 0.00026464293697219015, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "loss": 0.5851742029190063, + "step": 5910 + }, + { + "ce_loss": 0.24227118492126465, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "distill_loss": 0.11767081916332245, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "ref_ce_loss": 0.1285375952720642, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "loss": 0.5376459956169128, + "step": 5910 + }, + { + "ce_loss": 0.1833650916814804, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "distill_loss": 0.12777496874332428, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "ref_ce_loss": 0.09195635467767715, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "loss": 0.4259001910686493, + "step": 5910 + }, + { + "ce_loss": 0.16761203110218048, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "distill_loss": 0.1049601286649704, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "ref_ce_loss": 0.08499225974082947, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "loss": 0.46280694007873535, + "step": 5910 + }, + { + "ce_loss": 0.17035450041294098, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "distill_loss": 0.100443035364151, + "epoch": 1.971314209472982, + "step": 5910 + }, + { + "epoch": 1.971314209472982, + "ref_ce_loss": 0.19197778403759003, + "step": 5910 + }, + { + "epoch": 1.9746497665110074, + "loss": 0.5723, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "grad_norm": 3.073291063308716, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "learning_rate": 0.00026451220524680025, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "loss": 0.5005868077278137, + "step": 5920 + }, + { + "ce_loss": 0.1469360888004303, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "distill_loss": 0.11416827142238617, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "ref_ce_loss": 0.1178518682718277, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "loss": 0.3899126648902893, + "step": 5920 + }, + { + "ce_loss": 0.12613821029663086, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "distill_loss": 0.09031940251588821, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "ref_ce_loss": 0.12185925245285034, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "loss": 0.6879602074623108, + "step": 5920 + }, + { + "ce_loss": 0.24272885918617249, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "distill_loss": 0.1410408467054367, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "ref_ce_loss": 0.13041582703590393, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "loss": 0.5270417332649231, + "step": 5920 + }, + { + "ce_loss": 0.18191879987716675, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "distill_loss": 0.11637015640735626, + "epoch": 1.9746497665110074, + "step": 5920 + }, + { + "epoch": 1.9746497665110074, + "ref_ce_loss": 0.09732194989919662, + "step": 5920 + }, + { + "epoch": 1.9779853235490328, + "loss": 0.609, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "grad_norm": 2.8188343048095703, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "learning_rate": 0.0002643812646963194, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "loss": 2.293231725692749, + "step": 5930 + }, + { + "ce_loss": 1.2716293334960938, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "distill_loss": 0.10453981161117554, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "ref_ce_loss": 0.5263649821281433, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "loss": 2.203127861022949, + "step": 5930 + }, + { + "ce_loss": 1.4657419919967651, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "distill_loss": 0.09050406515598297, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "ref_ce_loss": 0.5690605640411377, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "loss": 2.1848506927490234, + "step": 5930 + }, + { + "ce_loss": 1.1554293632507324, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "distill_loss": 0.11221285909414291, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "ref_ce_loss": 0.7525448203086853, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "loss": 1.8322076797485352, + "step": 5930 + }, + { + "ce_loss": 1.1817811727523804, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "distill_loss": 0.11352679133415222, + "epoch": 1.9779853235490328, + "step": 5930 + }, + { + "epoch": 1.9779853235490328, + "ref_ce_loss": 0.49499446153640747, + "step": 5930 + }, + { + "epoch": 1.9813208805870581, + "loss": 0.8654, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "grad_norm": 27.285018920898438, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "learning_rate": 0.00026425011555953145, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "loss": 1.115926742553711, + "step": 5940 + }, + { + "ce_loss": 0.23986463248729706, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "distill_loss": 0.6295623779296875, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "ref_ce_loss": 0.11818370968103409, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "loss": 0.8795012831687927, + "step": 5940 + }, + { + "ce_loss": 0.1840430498123169, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "distill_loss": 0.5369489789009094, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "ref_ce_loss": 0.08884285390377045, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "loss": 1.0463898181915283, + "step": 5940 + }, + { + "ce_loss": 0.26496535539627075, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "distill_loss": 0.6432059407234192, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "ref_ce_loss": 0.13793134689331055, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "loss": 1.0505305528640747, + "step": 5940 + }, + { + "ce_loss": 0.23481756448745728, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "distill_loss": 0.6528819799423218, + "epoch": 1.9813208805870581, + "step": 5940 + }, + { + "epoch": 1.9813208805870581, + "ref_ce_loss": 0.10958369076251984, + "step": 5940 + }, + { + "epoch": 1.9846564376250835, + "loss": 0.8436, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "grad_norm": 3.0225157737731934, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "learning_rate": 0.00026411875807560075, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "loss": 0.6642615795135498, + "step": 5950 + }, + { + "ce_loss": 0.11888474971055984, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "distill_loss": 0.2751652002334595, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "ref_ce_loss": 0.13492824137210846, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "loss": 0.7998644113540649, + "step": 5950 + }, + { + "ce_loss": 0.17532217502593994, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "distill_loss": 0.35970568656921387, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "ref_ce_loss": 0.13275721669197083, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "loss": 0.8139857649803162, + "step": 5950 + }, + { + "ce_loss": 0.2516137659549713, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "distill_loss": 0.30513206124305725, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "ref_ce_loss": 0.1743960827589035, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "loss": 0.5148938894271851, + "step": 5950 + }, + { + "ce_loss": 0.11828169226646423, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "distill_loss": 0.2190283238887787, + "epoch": 1.9846564376250835, + "step": 5950 + }, + { + "epoch": 1.9846564376250835, + "ref_ce_loss": 0.11594554036855698, + "step": 5950 + }, + { + "epoch": 1.9879919946631088, + "loss": 0.7121, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "grad_norm": 3.4296367168426514, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "learning_rate": 0.00026398719248407147, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "loss": 0.818779706954956, + "step": 5960 + }, + { + "ce_loss": 0.2508390247821808, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "distill_loss": 0.18431255221366882, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "ref_ce_loss": 0.1367606669664383, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "loss": 0.5594602823257446, + "step": 5960 + }, + { + "ce_loss": 0.20339643955230713, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "distill_loss": 0.18110495805740356, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "ref_ce_loss": 0.13036209344863892, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "loss": 0.5207152366638184, + "step": 5960 + }, + { + "ce_loss": 0.15699785947799683, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "distill_loss": 0.18864737451076508, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "ref_ce_loss": 0.12153347581624985, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "loss": 0.4976940453052521, + "step": 5960 + }, + { + "ce_loss": 0.17792610824108124, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "distill_loss": 0.14083155989646912, + "epoch": 1.9879919946631088, + "step": 5960 + }, + { + "epoch": 1.9879919946631088, + "ref_ce_loss": 0.0966520607471466, + "step": 5960 + }, + { + "epoch": 1.9913275517011342, + "loss": 0.5663, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "grad_norm": 3.2900636196136475, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "learning_rate": 0.0002638554190248674, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "loss": 0.6181614995002747, + "step": 5970 + }, + { + "ce_loss": 0.1431739330291748, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "distill_loss": 0.20133964717388153, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "ref_ce_loss": 0.1477632224559784, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "loss": 0.5945953130722046, + "step": 5970 + }, + { + "ce_loss": 0.17509131133556366, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "distill_loss": 0.13901633024215698, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "ref_ce_loss": 0.13561907410621643, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "loss": 0.7570385336875916, + "step": 5970 + }, + { + "ce_loss": 0.19282585382461548, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "distill_loss": 0.2022811472415924, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "ref_ce_loss": 0.19970786571502686, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "loss": 0.686042070388794, + "step": 5970 + }, + { + "ce_loss": 0.19220466911792755, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "distill_loss": 0.22768956422805786, + "epoch": 1.9913275517011342, + "step": 5970 + }, + { + "epoch": 1.9913275517011342, + "ref_ce_loss": 0.12446795403957367, + "step": 5970 + }, + { + "epoch": 1.9946631087391595, + "loss": 0.7199, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "grad_norm": 4.222412109375, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "learning_rate": 0.0002637234379382913, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "loss": 0.5888592004776001, + "step": 5980 + }, + { + "ce_loss": 0.11838914453983307, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "distill_loss": 0.16122058033943176, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "ref_ce_loss": 0.12328799068927765, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "loss": 0.8882757425308228, + "step": 5980 + }, + { + "ce_loss": 0.2505829334259033, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "distill_loss": 0.33013272285461426, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "ref_ce_loss": 0.184593066573143, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "loss": 0.6736380457878113, + "step": 5980 + }, + { + "ce_loss": 0.16651169955730438, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "distill_loss": 0.18379709124565125, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "ref_ce_loss": 0.13057413697242737, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "loss": 1.0884795188903809, + "step": 5980 + }, + { + "ce_loss": 0.18364958465099335, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "distill_loss": 0.2071857750415802, + "epoch": 1.9946631087391595, + "step": 5980 + }, + { + "epoch": 1.9946631087391595, + "ref_ce_loss": 0.12370967864990234, + "step": 5980 + }, + { + "epoch": 1.9979986657771849, + "loss": 0.6674, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "grad_norm": 2.313319683074951, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "learning_rate": 0.0002635912494650246, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "loss": 0.6827449202537537, + "step": 5990 + }, + { + "ce_loss": 0.18689732253551483, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "distill_loss": 0.19142132997512817, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "ref_ce_loss": 0.08761729300022125, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "loss": 0.7562812566757202, + "step": 5990 + }, + { + "ce_loss": 0.2163257747888565, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "distill_loss": 0.2145807296037674, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "ref_ce_loss": 0.1543344110250473, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "loss": 0.48378419876098633, + "step": 5990 + }, + { + "ce_loss": 0.1646513193845749, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "distill_loss": 0.19733795523643494, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "ref_ce_loss": 0.12141157686710358, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "loss": 0.5468843579292297, + "step": 5990 + }, + { + "ce_loss": 0.18428729474544525, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "distill_loss": 0.2126753032207489, + "epoch": 1.9979986657771849, + "step": 5990 + }, + { + "epoch": 1.9979986657771849, + "ref_ce_loss": 0.11773700267076492, + "step": 5990 + }, + { + "epoch": 2.0013342228152102, + "loss": 0.672, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "grad_norm": 3.492262840270996, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "learning_rate": 0.00026345885384612705, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "loss": 0.6723750233650208, + "step": 6000 + }, + { + "ce_loss": 0.18430662155151367, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "distill_loss": 0.2313898801803589, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "ref_ce_loss": 0.08441024273633957, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "loss": 0.5557563304901123, + "step": 6000 + }, + { + "ce_loss": 0.11034717410802841, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "distill_loss": 0.15689697861671448, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "ref_ce_loss": 0.11964371055364609, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "loss": 0.6583637595176697, + "step": 6000 + }, + { + "ce_loss": 0.20213940739631653, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "distill_loss": 0.16685333847999573, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "ref_ce_loss": 0.12898676097393036, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "loss": 0.6852834820747375, + "step": 6000 + }, + { + "ce_loss": 0.2616039216518402, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "distill_loss": 0.1978679597377777, + "epoch": 2.0013342228152102, + "step": 6000 + }, + { + "epoch": 2.0013342228152102, + "ref_ce_loss": 0.161850243806839, + "step": 6000 + }, + { + "epoch": 2.0046697798532356, + "loss": 0.6376, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "grad_norm": 2.2328505516052246, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "learning_rate": 0.00026332625132303593, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "loss": 0.6028956174850464, + "step": 6010 + }, + { + "ce_loss": 0.09909051656723022, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "distill_loss": 0.12309257686138153, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "ref_ce_loss": 0.11756174266338348, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "loss": 0.5768246054649353, + "step": 6010 + }, + { + "ce_loss": 0.21978306770324707, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "distill_loss": 0.1698399931192398, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "ref_ce_loss": 0.14700907468795776, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "loss": 0.3713357448577881, + "step": 6010 + }, + { + "ce_loss": 0.08948267996311188, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "distill_loss": 0.12198400497436523, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "ref_ce_loss": 0.10145905613899231, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "loss": 0.5854678750038147, + "step": 6010 + }, + { + "ce_loss": 0.1666143238544464, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "distill_loss": 0.19001466035842896, + "epoch": 2.0046697798532356, + "step": 6010 + }, + { + "epoch": 2.0046697798532356, + "ref_ce_loss": 0.1575281322002411, + "step": 6010 + }, + { + "epoch": 2.008005336891261, + "loss": 0.5722, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "grad_norm": 3.4229934215545654, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "learning_rate": 0.0002631934421375659, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "loss": 0.6702373027801514, + "step": 6020 + }, + { + "ce_loss": 0.2024572342634201, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "distill_loss": 0.1616104245185852, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "ref_ce_loss": 0.11883606016635895, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "loss": 0.3642963171005249, + "step": 6020 + }, + { + "ce_loss": 0.08134083449840546, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "distill_loss": 0.09039433300495148, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "ref_ce_loss": 0.11555787175893784, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "loss": 0.7688882946968079, + "step": 6020 + }, + { + "ce_loss": 0.22774671018123627, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "distill_loss": 0.14509688317775726, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "ref_ce_loss": 0.14968301355838776, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "loss": 1.2047111988067627, + "step": 6020 + }, + { + "ce_loss": 0.17515689134597778, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "distill_loss": 0.1401824802160263, + "epoch": 2.008005336891261, + "step": 6020 + }, + { + "epoch": 2.008005336891261, + "ref_ce_loss": 0.10154224187135696, + "step": 6020 + }, + { + "epoch": 2.0113408939292863, + "loss": 0.5728, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "grad_norm": 2.178412437438965, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "learning_rate": 0.00026306042653190866, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "loss": 0.35371965169906616, + "step": 6030 + }, + { + "ce_loss": 0.11235601454973221, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "distill_loss": 0.09209266304969788, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "ref_ce_loss": 0.09106268733739853, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "loss": 0.48361748456954956, + "step": 6030 + }, + { + "ce_loss": 0.13192753493785858, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "distill_loss": 0.12088339775800705, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "ref_ce_loss": 0.18421778082847595, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "loss": 0.4473074674606323, + "step": 6030 + }, + { + "ce_loss": 0.1566929668188095, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "distill_loss": 0.10360036790370941, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "ref_ce_loss": 0.14987316727638245, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "loss": 0.7593709230422974, + "step": 6030 + }, + { + "ce_loss": 0.2081209421157837, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "distill_loss": 0.11178325861692429, + "epoch": 2.0113408939292863, + "step": 6030 + }, + { + "epoch": 2.0113408939292863, + "ref_ce_loss": 0.10972130298614502, + "step": 6030 + }, + { + "epoch": 2.0146764509673116, + "loss": 0.5387, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "grad_norm": 3.3054556846618652, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "learning_rate": 0.0002629272047486321, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "loss": 0.5572177767753601, + "step": 6040 + }, + { + "ce_loss": 0.2259761095046997, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "distill_loss": 0.12743355333805084, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "ref_ce_loss": 0.12631645798683167, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "loss": 0.44857120513916016, + "step": 6040 + }, + { + "ce_loss": 0.16541236639022827, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "distill_loss": 0.09709923714399338, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "ref_ce_loss": 0.13638761639595032, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "loss": 0.3600292205810547, + "step": 6040 + }, + { + "ce_loss": 0.17759163677692413, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "distill_loss": 0.09841334074735641, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "ref_ce_loss": 0.08374278247356415, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "loss": 1.024361252784729, + "step": 6040 + }, + { + "ce_loss": 0.1911391019821167, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "distill_loss": 0.13310448825359344, + "epoch": 2.0146764509673116, + "step": 6040 + }, + { + "epoch": 2.0146764509673116, + "ref_ce_loss": 0.11508171260356903, + "step": 6040 + }, + { + "epoch": 2.018012008005337, + "loss": 0.5694, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "grad_norm": 2.28583025932312, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "learning_rate": 0.0002627937770306802, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "loss": 0.4060389995574951, + "step": 6050 + }, + { + "ce_loss": 0.1576002985239029, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "distill_loss": 0.11558900773525238, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "ref_ce_loss": 0.09974104911088943, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "loss": 0.5782639384269714, + "step": 6050 + }, + { + "ce_loss": 0.2112276256084442, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "distill_loss": 0.1430872231721878, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "ref_ce_loss": 0.1537550836801529, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "loss": 0.6270675659179688, + "step": 6050 + }, + { + "ce_loss": 0.20577941834926605, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "distill_loss": 0.15070682764053345, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "ref_ce_loss": 0.15376164019107819, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "loss": 0.42507204413414, + "step": 6050 + }, + { + "ce_loss": 0.1491844207048416, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "distill_loss": 0.12194012105464935, + "epoch": 2.018012008005337, + "step": 6050 + }, + { + "epoch": 2.018012008005337, + "ref_ce_loss": 0.11052257567644119, + "step": 6050 + }, + { + "epoch": 2.0213475650433623, + "loss": 0.5665, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "grad_norm": 2.7430214881896973, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "learning_rate": 0.0002626601436213725, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "loss": 0.39292192459106445, + "step": 6060 + }, + { + "ce_loss": 0.13748258352279663, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "distill_loss": 0.1628316044807434, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "ref_ce_loss": 0.09246788918972015, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "loss": 0.6393857002258301, + "step": 6060 + }, + { + "ce_loss": 0.2324545830488205, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "distill_loss": 0.22455164790153503, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "ref_ce_loss": 0.13880237936973572, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "loss": 0.3977337181568146, + "step": 6060 + }, + { + "ce_loss": 0.11556189507246017, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "distill_loss": 0.16048678755760193, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "ref_ce_loss": 0.09150765091180801, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "loss": 0.6872441172599792, + "step": 6060 + }, + { + "ce_loss": 0.15894901752471924, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "distill_loss": 0.1782384216785431, + "epoch": 2.0213475650433623, + "step": 6060 + }, + { + "epoch": 2.0213475650433623, + "ref_ce_loss": 0.12021970003843307, + "step": 6060 + }, + { + "epoch": 2.0246831220813877, + "loss": 0.6173, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "grad_norm": 3.887601137161255, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "learning_rate": 0.00026252630476440367, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "loss": 0.47213730216026306, + "step": 6070 + }, + { + "ce_loss": 0.12705136835575104, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "distill_loss": 0.16267943382263184, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "ref_ce_loss": 0.062675841152668, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "loss": 0.5571383833885193, + "step": 6070 + }, + { + "ce_loss": 0.12326113879680634, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "distill_loss": 0.1477968990802765, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "ref_ce_loss": 0.13798700273036957, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "loss": 0.5846275687217712, + "step": 6070 + }, + { + "ce_loss": 0.16185100376605988, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "distill_loss": 0.18354424834251404, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "ref_ce_loss": 0.13381104171276093, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "loss": 0.863525390625, + "step": 6070 + }, + { + "ce_loss": 0.319327712059021, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "distill_loss": 0.27865153551101685, + "epoch": 2.0246831220813877, + "step": 6070 + }, + { + "epoch": 2.0246831220813877, + "ref_ce_loss": 0.19892120361328125, + "step": 6070 + }, + { + "epoch": 2.028018679119413, + "loss": 0.6443, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "grad_norm": 1.872769832611084, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "learning_rate": 0.0002623922607038429, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "loss": 0.41863390803337097, + "step": 6080 + }, + { + "ce_loss": 0.0914531722664833, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "distill_loss": 0.22104503214359283, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "ref_ce_loss": 0.10593371838331223, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "loss": 0.44628605246543884, + "step": 6080 + }, + { + "ce_loss": 0.13750892877578735, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "distill_loss": 0.18723967671394348, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "ref_ce_loss": 0.08156903088092804, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "loss": 0.6728360652923584, + "step": 6080 + }, + { + "ce_loss": 0.19420503079891205, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "distill_loss": 0.20174559950828552, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "ref_ce_loss": 0.13595017790794373, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "loss": 1.2898201942443848, + "step": 6080 + }, + { + "ce_loss": 0.36071357131004333, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "distill_loss": 0.251817524433136, + "epoch": 2.028018679119413, + "step": 6080 + }, + { + "epoch": 2.028018679119413, + "ref_ce_loss": 0.1717768758535385, + "step": 6080 + }, + { + "epoch": 2.0313542361574384, + "loss": 0.6395, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "grad_norm": 3.3039674758911133, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "learning_rate": 0.00026225801168413377, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "loss": 0.5026826858520508, + "step": 6090 + }, + { + "ce_loss": 0.13728295266628265, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "distill_loss": 0.1531429886817932, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "ref_ce_loss": 0.1067405492067337, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "loss": 0.552277684211731, + "step": 6090 + }, + { + "ce_loss": 0.17691931128501892, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "distill_loss": 0.16204652190208435, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "ref_ce_loss": 0.11657890677452087, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "loss": 0.32724007964134216, + "step": 6090 + }, + { + "ce_loss": 0.05284838378429413, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "distill_loss": 0.12443891167640686, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "ref_ce_loss": 0.149708092212677, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "loss": 0.3944059908390045, + "step": 6090 + }, + { + "ce_loss": 0.137527734041214, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "distill_loss": 0.16900943219661713, + "epoch": 2.0313542361574384, + "step": 6090 + }, + { + "epoch": 2.0313542361574384, + "ref_ce_loss": 0.08755436539649963, + "step": 6090 + }, + { + "epoch": 2.0346897931954637, + "loss": 0.552, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "grad_norm": 4.813361167907715, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "learning_rate": 0.00026212355795009353, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "loss": 0.4744175970554352, + "step": 6100 + }, + { + "ce_loss": 0.14979687333106995, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "distill_loss": 0.1268942803144455, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "ref_ce_loss": 0.14003252983093262, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "loss": 0.5676864981651306, + "step": 6100 + }, + { + "ce_loss": 0.16010522842407227, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "distill_loss": 0.11075478047132492, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "ref_ce_loss": 0.13726557791233063, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "loss": 0.5984420776367188, + "step": 6100 + }, + { + "ce_loss": 0.1581433117389679, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "distill_loss": 0.15865829586982727, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "ref_ce_loss": 0.1082833856344223, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "loss": 0.5503508448600769, + "step": 6100 + }, + { + "ce_loss": 0.16133596003055573, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "distill_loss": 0.11832837760448456, + "epoch": 2.0346897931954637, + "step": 6100 + }, + { + "epoch": 2.0346897931954637, + "ref_ce_loss": 0.08712395280599594, + "step": 6100 + }, + { + "epoch": 2.038025350233489, + "loss": 0.5482, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "grad_norm": 5.132610321044922, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "learning_rate": 0.00026198889974691266, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "loss": 0.6514615416526794, + "step": 6110 + }, + { + "ce_loss": 0.16826602816581726, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "distill_loss": 0.1439044177532196, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "ref_ce_loss": 0.1011396050453186, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "loss": 0.5035021901130676, + "step": 6110 + }, + { + "ce_loss": 0.17770890891551971, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "distill_loss": 0.1277397722005844, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "ref_ce_loss": 0.14741405844688416, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "loss": 0.7693830728530884, + "step": 6110 + }, + { + "ce_loss": 0.20457173883914948, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "distill_loss": 0.11660545319318771, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "ref_ce_loss": 0.16994109749794006, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "loss": 0.36567243933677673, + "step": 6110 + }, + { + "ce_loss": 0.11716575175523758, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "distill_loss": 0.09368336945772171, + "epoch": 2.038025350233489, + "step": 6110 + }, + { + "epoch": 2.038025350233489, + "ref_ce_loss": 0.10542420297861099, + "step": 6110 + }, + { + "epoch": 2.0413609072715144, + "loss": 0.5739, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "grad_norm": 2.7173047065734863, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "learning_rate": 0.00026185403732015473, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "loss": 0.5491402745246887, + "step": 6120 + }, + { + "ce_loss": 0.2195517122745514, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "distill_loss": 0.11219502240419388, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "ref_ce_loss": 0.12330888211727142, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "loss": 0.6513844132423401, + "step": 6120 + }, + { + "ce_loss": 0.2567315697669983, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "distill_loss": 0.11513234674930573, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "ref_ce_loss": 0.17506708204746246, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "loss": 0.6120648384094238, + "step": 6120 + }, + { + "ce_loss": 0.19124412536621094, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "distill_loss": 0.10679781436920166, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "ref_ce_loss": 0.1378684788942337, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "loss": 0.3801417648792267, + "step": 6120 + }, + { + "ce_loss": 0.149167001247406, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "distill_loss": 0.12198339402675629, + "epoch": 2.0413609072715144, + "step": 6120 + }, + { + "epoch": 2.0413609072715144, + "ref_ce_loss": 0.08929223567247391, + "step": 6120 + }, + { + "epoch": 2.0446964643095398, + "loss": 0.5375, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "grad_norm": 2.9968857765197754, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "learning_rate": 0.0002617189709157555, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "loss": 0.5337057113647461, + "step": 6130 + }, + { + "ce_loss": 0.216720849275589, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "distill_loss": 0.13213863968849182, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "ref_ce_loss": 0.14722806215286255, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "loss": 0.4620002508163452, + "step": 6130 + }, + { + "ce_loss": 0.13662521541118622, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "distill_loss": 0.0959618091583252, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "ref_ce_loss": 0.12215406447649002, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "loss": 0.4098561406135559, + "step": 6130 + }, + { + "ce_loss": 0.12565305829048157, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "distill_loss": 0.10984811186790466, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "ref_ce_loss": 0.13379180431365967, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "loss": 0.4601963758468628, + "step": 6130 + }, + { + "ce_loss": 0.11863856762647629, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "distill_loss": 0.08010222762823105, + "epoch": 2.0446964643095398, + "step": 6130 + }, + { + "epoch": 2.0446964643095398, + "ref_ce_loss": 0.12990163266658783, + "step": 6130 + }, + { + "epoch": 2.048032021347565, + "loss": 0.5323, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "grad_norm": 2.794053077697754, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "learning_rate": 0.0002615837007800229, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "loss": 0.7886110544204712, + "step": 6140 + }, + { + "ce_loss": 0.15798018872737885, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "distill_loss": 0.17019721865653992, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "ref_ce_loss": 0.13794007897377014, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "loss": 0.4465550482273102, + "step": 6140 + }, + { + "ce_loss": 0.17327573895454407, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "distill_loss": 0.12840725481510162, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "ref_ce_loss": 0.10248830169439316, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "loss": 0.5394412279129028, + "step": 6140 + }, + { + "ce_loss": 0.14059796929359436, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "distill_loss": 0.1589897871017456, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "ref_ce_loss": 0.12873795628547668, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "loss": 0.653471827507019, + "step": 6140 + }, + { + "ce_loss": 0.2507249712944031, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "distill_loss": 0.15221871435642242, + "epoch": 2.048032021347565, + "step": 6140 + }, + { + "epoch": 2.048032021347565, + "ref_ce_loss": 0.12897253036499023, + "step": 6140 + }, + { + "epoch": 2.0513675783855905, + "loss": 0.6209, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "grad_norm": 16.627849578857422, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "learning_rate": 0.00026144822715963627, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "loss": 0.5434788465499878, + "step": 6150 + }, + { + "ce_loss": 0.1266670972108841, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "distill_loss": 0.25511863827705383, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "ref_ce_loss": 0.09655553847551346, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "loss": 0.7020452618598938, + "step": 6150 + }, + { + "ce_loss": 0.1957281082868576, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "distill_loss": 0.279159814119339, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "ref_ce_loss": 0.09773007780313492, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "loss": 0.589733898639679, + "step": 6150 + }, + { + "ce_loss": 0.2290581315755844, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "distill_loss": 0.2273220419883728, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "ref_ce_loss": 0.13324572145938873, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "loss": 0.6807817816734314, + "step": 6150 + }, + { + "ce_loss": 0.21881437301635742, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "distill_loss": 0.31649279594421387, + "epoch": 2.0513675783855905, + "step": 6150 + }, + { + "epoch": 2.0513675783855905, + "ref_ce_loss": 0.14447903633117676, + "step": 6150 + }, + { + "epoch": 2.054703135423616, + "loss": 0.6239, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "grad_norm": 3.414583206176758, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "learning_rate": 0.000261312550301646, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "loss": 0.7235752940177917, + "step": 6160 + }, + { + "ce_loss": 0.26676416397094727, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "distill_loss": 0.2171621322631836, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "ref_ce_loss": 0.1268942803144455, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "loss": 0.5398198962211609, + "step": 6160 + }, + { + "ce_loss": 0.17983555793762207, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "distill_loss": 0.22419527173042297, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "ref_ce_loss": 0.08115441352128983, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "loss": 0.5893262624740601, + "step": 6160 + }, + { + "ce_loss": 0.16456808149814606, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "distill_loss": 0.13162600994110107, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "ref_ce_loss": 0.13405293226242065, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "loss": 0.5403462052345276, + "step": 6160 + }, + { + "ce_loss": 0.19443656504154205, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "distill_loss": 0.17818742990493774, + "epoch": 2.054703135423616, + "step": 6160 + }, + { + "epoch": 2.054703135423616, + "ref_ce_loss": 0.13629718124866486, + "step": 6160 + }, + { + "epoch": 2.058038692461641, + "loss": 0.5894, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "grad_norm": 2.5601234436035156, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "learning_rate": 0.0002611766704534732, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "loss": 0.49508172273635864, + "step": 6170 + }, + { + "ce_loss": 0.14286686480045319, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "distill_loss": 0.14650699496269226, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "ref_ce_loss": 0.15913353860378265, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "loss": 0.3224010467529297, + "step": 6170 + }, + { + "ce_loss": 0.11369533091783524, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "distill_loss": 0.13524405658245087, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "ref_ce_loss": 0.07325857877731323, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "loss": 0.43581756949424744, + "step": 6170 + }, + { + "ce_loss": 0.13941310346126556, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "distill_loss": 0.17731405794620514, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "ref_ce_loss": 0.08746254444122314, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "loss": 0.5797979235649109, + "step": 6170 + }, + { + "ce_loss": 0.15138761699199677, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "distill_loss": 0.1986589878797531, + "epoch": 2.058038692461641, + "step": 6170 + }, + { + "epoch": 2.058038692461641, + "ref_ce_loss": 0.11812014132738113, + "step": 6170 + }, + { + "epoch": 2.0613742494996665, + "loss": 0.5313, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "grad_norm": 2.2964236736297607, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "learning_rate": 0.00026104058786290905, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "loss": 0.44557082653045654, + "step": 6180 + }, + { + "ce_loss": 0.12464036047458649, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "distill_loss": 0.10614349693059921, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "ref_ce_loss": 0.09893155843019485, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "loss": 0.522710919380188, + "step": 6180 + }, + { + "ce_loss": 0.16048184037208557, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "distill_loss": 0.13187257945537567, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "ref_ce_loss": 0.1442248374223709, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "loss": 0.6206411123275757, + "step": 6180 + }, + { + "ce_loss": 0.1626727133989334, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "distill_loss": 0.14576083421707153, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "ref_ce_loss": 0.12461519986391068, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "loss": 0.7486047744750977, + "step": 6180 + }, + { + "ce_loss": 0.2025829553604126, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "distill_loss": 0.13131900131702423, + "epoch": 2.0613742494996665, + "step": 6180 + }, + { + "epoch": 2.0613742494996665, + "ref_ce_loss": 0.13520632684230804, + "step": 6180 + }, + { + "epoch": 2.064709806537692, + "loss": 0.6143, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "grad_norm": 2.8212482929229736, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "learning_rate": 0.0002609043027781146, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "loss": 0.5236290097236633, + "step": 6190 + }, + { + "ce_loss": 0.19522197544574738, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "distill_loss": 0.10594858974218369, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "ref_ce_loss": 0.11298041045665741, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "loss": 0.42183634638786316, + "step": 6190 + }, + { + "ce_loss": 0.15197670459747314, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "distill_loss": 0.11152088642120361, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "ref_ce_loss": 0.15807226300239563, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "loss": 0.671714186668396, + "step": 6190 + }, + { + "ce_loss": 0.26314622163772583, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "distill_loss": 0.13445574045181274, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "ref_ce_loss": 0.18136006593704224, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "loss": 0.40071842074394226, + "step": 6190 + }, + { + "ce_loss": 0.19335207343101501, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "distill_loss": 0.09611434489488602, + "epoch": 2.064709806537692, + "step": 6190 + }, + { + "epoch": 2.064709806537692, + "ref_ce_loss": 0.07939335703849792, + "step": 6190 + }, + { + "epoch": 2.068045363575717, + "loss": 0.5268, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "grad_norm": 4.135841369628906, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "learning_rate": 0.00026076781544762015, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "loss": 0.5495902299880981, + "step": 6200 + }, + { + "ce_loss": 0.2173391878604889, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "distill_loss": 0.10663158446550369, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "ref_ce_loss": 0.14941971004009247, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "loss": 0.8383069038391113, + "step": 6200 + }, + { + "ce_loss": 0.12289828807115555, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "distill_loss": 0.171888530254364, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "ref_ce_loss": 0.13419800996780396, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "loss": 0.5588772892951965, + "step": 6200 + }, + { + "ce_loss": 0.18625810742378235, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "distill_loss": 0.14179158210754395, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "ref_ce_loss": 0.11892954260110855, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "loss": 0.808592677116394, + "step": 6200 + }, + { + "ce_loss": 0.20481625199317932, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "distill_loss": 0.17151331901550293, + "epoch": 2.068045363575717, + "step": 6200 + }, + { + "epoch": 2.068045363575717, + "ref_ce_loss": 0.06582659482955933, + "step": 6200 + }, + { + "epoch": 2.0713809206137426, + "loss": 0.6267, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "grad_norm": 3.014910936355591, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "learning_rate": 0.00026063112612032457, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "loss": 0.3734577000141144, + "step": 6210 + }, + { + "ce_loss": 0.09334731101989746, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "distill_loss": 0.1371557116508484, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "ref_ce_loss": 0.09609896689653397, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "loss": 0.7269737124443054, + "step": 6210 + }, + { + "ce_loss": 0.27129364013671875, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "distill_loss": 0.23927046358585358, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "ref_ce_loss": 0.12874388694763184, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "loss": 0.68470299243927, + "step": 6210 + }, + { + "ce_loss": 0.11388979107141495, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "distill_loss": 0.15361973643302917, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "ref_ce_loss": 0.1297324299812317, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "loss": 1.0848585367202759, + "step": 6210 + }, + { + "ce_loss": 0.14715316891670227, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "distill_loss": 0.18684878945350647, + "epoch": 2.0713809206137426, + "step": 6210 + }, + { + "epoch": 2.0713809206137426, + "ref_ce_loss": 0.08337806910276413, + "step": 6210 + }, + { + "epoch": 2.074716477651768, + "loss": 0.5362, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "grad_norm": 3.284393072128296, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "learning_rate": 0.00026049423504549544, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "loss": 0.31834161281585693, + "step": 6220 + }, + { + "ce_loss": 0.079567089676857, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "distill_loss": 0.15186390280723572, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "ref_ce_loss": 0.08671994507312775, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "loss": 0.8450188636779785, + "step": 6220 + }, + { + "ce_loss": 0.31348034739494324, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "distill_loss": 0.2693202793598175, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "ref_ce_loss": 0.18057137727737427, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "loss": 0.6984924077987671, + "step": 6220 + }, + { + "ce_loss": 0.18971113860607147, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "distill_loss": 0.14513225853443146, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "ref_ce_loss": 0.11545384675264359, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "loss": 0.43207088112831116, + "step": 6220 + }, + { + "ce_loss": 0.10766131430864334, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "distill_loss": 0.14692091941833496, + "epoch": 2.074716477651768, + "step": 6220 + }, + { + "epoch": 2.074716477651768, + "ref_ce_loss": 0.08508320897817612, + "step": 6220 + }, + { + "epoch": 2.0780520346897933, + "loss": 0.5354, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "grad_norm": 3.0783700942993164, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "learning_rate": 0.0002603571424727679, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "loss": 0.5964272618293762, + "step": 6230 + }, + { + "ce_loss": 0.22434349358081818, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "distill_loss": 0.095759816467762, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "ref_ce_loss": 0.1369304656982422, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "loss": 0.4027251601219177, + "step": 6230 + }, + { + "ce_loss": 0.0775301530957222, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "distill_loss": 0.10245171189308167, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "ref_ce_loss": 0.12542954087257385, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "loss": 0.38007214665412903, + "step": 6230 + }, + { + "ce_loss": 0.15331995487213135, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "distill_loss": 0.11959885060787201, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "ref_ce_loss": 0.10672856867313385, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "loss": 0.5621980428695679, + "step": 6230 + }, + { + "ce_loss": 0.26177355647087097, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "distill_loss": 0.15812274813652039, + "epoch": 2.0780520346897933, + "step": 6230 + }, + { + "epoch": 2.0780520346897933, + "ref_ce_loss": 0.09878107905387878, + "step": 6230 + }, + { + "epoch": 2.0813875917278186, + "loss": 0.5493, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "grad_norm": 2.6283464431762695, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "learning_rate": 0.00026021984865214493, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "loss": 0.516473650932312, + "step": 6240 + }, + { + "ce_loss": 0.22216933965682983, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "distill_loss": 0.14236842095851898, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "ref_ce_loss": 0.11821052432060242, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "loss": 1.0167242288589478, + "step": 6240 + }, + { + "ce_loss": 0.1872362345457077, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "distill_loss": 0.1137826144695282, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "ref_ce_loss": 0.07063769549131393, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "loss": 0.49501004815101624, + "step": 6240 + }, + { + "ce_loss": 0.19312813878059387, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "distill_loss": 0.1220867857336998, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "ref_ce_loss": 0.11821332573890686, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "loss": 0.5460554361343384, + "step": 6240 + }, + { + "ce_loss": 0.1809922456741333, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "distill_loss": 0.14729411900043488, + "epoch": 2.0813875917278186, + "step": 6240 + }, + { + "epoch": 2.0813875917278186, + "ref_ce_loss": 0.1758243888616562, + "step": 6240 + }, + { + "epoch": 2.084723148765844, + "loss": 0.5744, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "grad_norm": 2.9297170639038086, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "learning_rate": 0.00026008235383399614, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "loss": 0.5299870371818542, + "step": 6250 + }, + { + "ce_loss": 0.20783375203609467, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "distill_loss": 0.18256841599941254, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "ref_ce_loss": 0.10592147707939148, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "loss": 0.5724793672561646, + "step": 6250 + }, + { + "ce_loss": 0.13250119984149933, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "distill_loss": 0.1532224714756012, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "ref_ce_loss": 0.12249592691659927, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "loss": 0.601777195930481, + "step": 6250 + }, + { + "ce_loss": 0.26903092861175537, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "distill_loss": 0.16014234721660614, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "ref_ce_loss": 0.1334620863199234, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "loss": 0.5935980081558228, + "step": 6250 + }, + { + "ce_loss": 0.1840442568063736, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "distill_loss": 0.1481340080499649, + "epoch": 2.084723148765844, + "step": 6250 + }, + { + "epoch": 2.084723148765844, + "ref_ce_loss": 0.15468862652778625, + "step": 6250 + }, + { + "epoch": 2.0880587058038693, + "loss": 0.5446, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "grad_norm": 2.8151493072509766, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "learning_rate": 0.00025994465826905793, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "loss": 0.3203030228614807, + "step": 6260 + }, + { + "ce_loss": 0.12304674088954926, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "distill_loss": 0.11692545562982559, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "ref_ce_loss": 0.08023111522197723, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "loss": 0.5208563804626465, + "step": 6260 + }, + { + "ce_loss": 0.19634287059307098, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "distill_loss": 0.11992079019546509, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "ref_ce_loss": 0.15068615972995758, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "loss": 0.4007379114627838, + "step": 6260 + }, + { + "ce_loss": 0.13138967752456665, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "distill_loss": 0.10143236815929413, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "ref_ce_loss": 0.11003472656011581, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "loss": 0.44156375527381897, + "step": 6260 + }, + { + "ce_loss": 0.15144360065460205, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "distill_loss": 0.10493350028991699, + "epoch": 2.0880587058038693, + "step": 6260 + }, + { + "epoch": 2.0880587058038693, + "ref_ce_loss": 0.12747961282730103, + "step": 6260 + }, + { + "epoch": 2.0913942628418947, + "loss": 0.5177, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "grad_norm": 3.0881059169769287, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "learning_rate": 0.00025980676220843267, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "loss": 0.46956267952919006, + "step": 6270 + }, + { + "ce_loss": 0.18205973505973816, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "distill_loss": 0.1669866442680359, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "ref_ce_loss": 0.11989451199769974, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "loss": 0.4681810140609741, + "step": 6270 + }, + { + "ce_loss": 0.16584455966949463, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "distill_loss": 0.1511474847793579, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "ref_ce_loss": 0.10125041007995605, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "loss": 0.6433732509613037, + "step": 6270 + }, + { + "ce_loss": 0.23725828528404236, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "distill_loss": 0.12847734987735748, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "ref_ce_loss": 0.1302829384803772, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "loss": 0.7147667407989502, + "step": 6270 + }, + { + "ce_loss": 0.3272174298763275, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "distill_loss": 0.18335974216461182, + "epoch": 2.0913942628418947, + "step": 6270 + }, + { + "epoch": 2.0913942628418947, + "ref_ce_loss": 0.1645212322473526, + "step": 6270 + }, + { + "epoch": 2.09472981987992, + "loss": 0.5341, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "grad_norm": 2.9068071842193604, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "learning_rate": 0.0002596686659035884, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "loss": 0.6974978446960449, + "step": 6280 + }, + { + "ce_loss": 0.2411918044090271, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "distill_loss": 0.19004279375076294, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "ref_ce_loss": 0.19078131020069122, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "loss": 0.8441246151924133, + "step": 6280 + }, + { + "ce_loss": 0.1914375275373459, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "distill_loss": 0.18291591107845306, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "ref_ce_loss": 0.1954183131456375, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "loss": 0.700549304485321, + "step": 6280 + }, + { + "ce_loss": 0.17326299846172333, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "distill_loss": 0.22238874435424805, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "ref_ce_loss": 0.1731567084789276, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "loss": 0.5677235126495361, + "step": 6280 + }, + { + "ce_loss": 0.2340429276227951, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "distill_loss": 0.13371285796165466, + "epoch": 2.09472981987992, + "step": 6280 + }, + { + "epoch": 2.09472981987992, + "ref_ce_loss": 0.14143046736717224, + "step": 6280 + }, + { + "epoch": 2.0980653769179454, + "loss": 0.5686, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "grad_norm": 2.8393349647521973, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "learning_rate": 0.0002595303696063582, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "loss": 0.4551650583744049, + "step": 6290 + }, + { + "ce_loss": 0.1547398418188095, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "distill_loss": 0.11198326200246811, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "ref_ce_loss": 0.1279253214597702, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "loss": 0.6482787132263184, + "step": 6290 + }, + { + "ce_loss": 0.25222861766815186, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "distill_loss": 0.13030463457107544, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "ref_ce_loss": 0.14368154108524323, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "loss": 0.662798285484314, + "step": 6290 + }, + { + "ce_loss": 0.20530973374843597, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "distill_loss": 0.1648654192686081, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "ref_ce_loss": 0.09448906034231186, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "loss": 0.5949462652206421, + "step": 6290 + }, + { + "ce_loss": 0.2017897069454193, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "distill_loss": 0.14443297684192657, + "epoch": 2.0980653769179454, + "step": 6290 + }, + { + "epoch": 2.0980653769179454, + "ref_ce_loss": 0.1371644288301468, + "step": 6290 + }, + { + "epoch": 2.1014009339559707, + "loss": 0.5499, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "grad_norm": 3.2759594917297363, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "learning_rate": 0.0002593918735689401, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "loss": 0.4449702501296997, + "step": 6300 + }, + { + "ce_loss": 0.12942036986351013, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "distill_loss": 0.11134645342826843, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "ref_ce_loss": 0.08583880960941315, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "loss": 0.6386305093765259, + "step": 6300 + }, + { + "ce_loss": 0.14869295060634613, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "distill_loss": 0.1496465802192688, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "ref_ce_loss": 0.1405908614397049, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "loss": 1.184739351272583, + "step": 6300 + }, + { + "ce_loss": 0.17178437113761902, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "distill_loss": 0.10091239213943481, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "ref_ce_loss": 0.12160031497478485, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "loss": 0.4029628336429596, + "step": 6300 + }, + { + "ce_loss": 0.09635565429925919, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "distill_loss": 0.10692538321018219, + "epoch": 2.1014009339559707, + "step": 6300 + }, + { + "epoch": 2.1014009339559707, + "ref_ce_loss": 0.12069247663021088, + "step": 6300 + }, + { + "epoch": 2.104736490993996, + "loss": 0.5475, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "grad_norm": 2.3113832473754883, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "learning_rate": 0.0002592531780438962, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "loss": 0.704979658126831, + "step": 6310 + }, + { + "ce_loss": 0.18846255540847778, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "distill_loss": 0.10475876927375793, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "ref_ce_loss": 0.20854370296001434, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "loss": 0.6280515193939209, + "step": 6310 + }, + { + "ce_loss": 0.13968916237354279, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "distill_loss": 0.10155282914638519, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "ref_ce_loss": 0.10801265388727188, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "loss": 0.4487094283103943, + "step": 6310 + }, + { + "ce_loss": 0.05168895050883293, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "distill_loss": 0.09908200800418854, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "ref_ce_loss": 0.07915835827589035, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "loss": 0.5438628196716309, + "step": 6310 + }, + { + "ce_loss": 0.15742728114128113, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "distill_loss": 0.1507430076599121, + "epoch": 2.104736490993996, + "step": 6310 + }, + { + "epoch": 2.104736490993996, + "ref_ce_loss": 0.1463308483362198, + "step": 6310 + }, + { + "epoch": 2.1080720480320214, + "loss": 0.5094, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "grad_norm": 3.62620210647583, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "learning_rate": 0.0002591142832841524, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "loss": 0.5473156571388245, + "step": 6320 + }, + { + "ce_loss": 0.2167169153690338, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "distill_loss": 0.12036389857530594, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "ref_ce_loss": 0.14263157546520233, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "loss": 0.5363081693649292, + "step": 6320 + }, + { + "ce_loss": 0.18843142688274384, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "distill_loss": 0.11299119144678116, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "ref_ce_loss": 0.1542125791311264, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "loss": 0.5071948170661926, + "step": 6320 + }, + { + "ce_loss": 0.1818801909685135, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "distill_loss": 0.10943155735731125, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "ref_ce_loss": 0.10278405249118805, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "loss": 0.5227207541465759, + "step": 6320 + }, + { + "ce_loss": 0.2548081874847412, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "distill_loss": 0.10464075952768326, + "epoch": 2.1080720480320214, + "step": 6320 + }, + { + "epoch": 2.1080720480320214, + "ref_ce_loss": 0.16315731406211853, + "step": 6320 + }, + { + "epoch": 2.1114076050700468, + "loss": 0.5356, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "grad_norm": 2.8032443523406982, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "learning_rate": 0.0002589751895429979, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "loss": 0.5398804545402527, + "step": 6330 + }, + { + "ce_loss": 0.23482723534107208, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "distill_loss": 0.12033455818891525, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "ref_ce_loss": 0.13513240218162537, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "loss": 0.5462663769721985, + "step": 6330 + }, + { + "ce_loss": 0.16122372448444366, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "distill_loss": 0.0924907997250557, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "ref_ce_loss": 0.1783560961484909, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "loss": 0.7903030514717102, + "step": 6330 + }, + { + "ce_loss": 0.13481366634368896, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "distill_loss": 0.10490374267101288, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "ref_ce_loss": 0.144536554813385, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "loss": 0.7620557546615601, + "step": 6330 + }, + { + "ce_loss": 0.2825464606285095, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "distill_loss": 0.11635222285985947, + "epoch": 2.1114076050700468, + "step": 6330 + }, + { + "epoch": 2.1114076050700468, + "ref_ce_loss": 0.1272297352552414, + "step": 6330 + }, + { + "epoch": 2.114743162108072, + "loss": 0.5484, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "grad_norm": 3.093912124633789, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "learning_rate": 0.00025883589707408495, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "loss": 0.44437944889068604, + "step": 6340 + }, + { + "ce_loss": 0.10807617753744125, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "distill_loss": 0.08297376334667206, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "ref_ce_loss": 0.12267817556858063, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "loss": 0.4830673336982727, + "step": 6340 + }, + { + "ce_loss": 0.21065731346607208, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "distill_loss": 0.13647867739200592, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "ref_ce_loss": 0.10502800345420837, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "loss": 0.6829445362091064, + "step": 6340 + }, + { + "ce_loss": 0.20690180361270905, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "distill_loss": 0.10831039398908615, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "ref_ce_loss": 0.14528189599514008, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "loss": 0.39419299364089966, + "step": 6340 + }, + { + "ce_loss": 0.17007343471050262, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "distill_loss": 0.10755617171525955, + "epoch": 2.114743162108072, + "step": 6340 + }, + { + "epoch": 2.114743162108072, + "ref_ce_loss": 0.11648072302341461, + "step": 6340 + }, + { + "epoch": 2.1180787191460975, + "loss": 0.5393, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "grad_norm": 2.7056565284729004, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "learning_rate": 0.00025869640613142796, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "loss": 0.4408019483089447, + "step": 6350 + }, + { + "ce_loss": 0.1383562535047531, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "distill_loss": 0.12275628000497818, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "ref_ce_loss": 0.06599508970975876, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "loss": 0.30788254737854004, + "step": 6350 + }, + { + "ce_loss": 0.10194658488035202, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "distill_loss": 0.09040798991918564, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "ref_ce_loss": 0.06405435502529144, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "loss": 0.41332122683525085, + "step": 6350 + }, + { + "ce_loss": 0.13060200214385986, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "distill_loss": 0.13377758860588074, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "ref_ce_loss": 0.11965243518352509, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "loss": 0.43371546268463135, + "step": 6350 + }, + { + "ce_loss": 0.15489931404590607, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "distill_loss": 0.13392747938632965, + "epoch": 2.1180787191460975, + "step": 6350 + }, + { + "epoch": 2.1180787191460975, + "ref_ce_loss": 0.11093331128358841, + "step": 6350 + }, + { + "epoch": 2.121414276184123, + "loss": 0.5057, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "grad_norm": 1.7513467073440552, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "learning_rate": 0.00025855671696940345, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "loss": 0.318505197763443, + "step": 6360 + }, + { + "ce_loss": 0.08526570349931717, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "distill_loss": 0.07790465652942657, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "ref_ce_loss": 0.08237326890230179, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "loss": 0.42279934883117676, + "step": 6360 + }, + { + "ce_loss": 0.15253883600234985, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "distill_loss": 0.09407106041908264, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "ref_ce_loss": 0.09206009656190872, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "loss": 0.8274306058883667, + "step": 6360 + }, + { + "ce_loss": 0.36078494787216187, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "distill_loss": 0.15812142193317413, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "ref_ce_loss": 0.15409733355045319, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "loss": 0.4437326490879059, + "step": 6360 + }, + { + "ce_loss": 0.14433549344539642, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "distill_loss": 0.0919932946562767, + "epoch": 2.121414276184123, + "step": 6360 + }, + { + "epoch": 2.121414276184123, + "ref_ce_loss": 0.1464773565530777, + "step": 6360 + }, + { + "epoch": 2.124749833222148, + "loss": 0.5228, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "grad_norm": 2.1484248638153076, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "learning_rate": 0.0002584168298427493, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "loss": 0.43399181962013245, + "step": 6370 + }, + { + "ce_loss": 0.15099979937076569, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "distill_loss": 0.1152973547577858, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "ref_ce_loss": 0.11432129144668579, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "loss": 0.6378722786903381, + "step": 6370 + }, + { + "ce_loss": 0.1807975023984909, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "distill_loss": 0.08343417942523956, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "ref_ce_loss": 0.17199712991714478, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "loss": 0.2268414944410324, + "step": 6370 + }, + { + "ce_loss": 0.066691555082798, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "distill_loss": 0.07913003861904144, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "ref_ce_loss": 0.08088640868663788, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "loss": 1.0655454397201538, + "step": 6370 + }, + { + "ce_loss": 0.12815611064434052, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "distill_loss": 0.09509027004241943, + "epoch": 2.124749833222148, + "step": 6370 + }, + { + "epoch": 2.124749833222148, + "ref_ce_loss": 0.11971811205148697, + "step": 6370 + }, + { + "epoch": 2.1280853902601735, + "loss": 0.5187, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "grad_norm": 2.40598464012146, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "learning_rate": 0.00025827674500656446, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "loss": 0.6937955617904663, + "step": 6380 + }, + { + "ce_loss": 0.16688190400600433, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "distill_loss": 0.10379371047019958, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "ref_ce_loss": 0.15276721119880676, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "loss": 0.42396658658981323, + "step": 6380 + }, + { + "ce_loss": 0.08682675659656525, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "distill_loss": 0.10008202493190765, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "ref_ce_loss": 0.10740143060684204, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "loss": 1.1396608352661133, + "step": 6380 + }, + { + "ce_loss": 0.22738853096961975, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "distill_loss": 0.14909625053405762, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "ref_ce_loss": 0.20061470568180084, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "loss": 0.5962453484535217, + "step": 6380 + }, + { + "ce_loss": 0.1423494815826416, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "distill_loss": 0.09867765009403229, + "epoch": 2.1280853902601735, + "step": 6380 + }, + { + "epoch": 2.1280853902601735, + "ref_ce_loss": 0.16645397245883942, + "step": 6380 + }, + { + "epoch": 2.131420947298199, + "loss": 0.6273, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "grad_norm": 6.128455638885498, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "learning_rate": 0.0002581364627163084, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "loss": 0.45542195439338684, + "step": 6390 + }, + { + "ce_loss": 0.15142342448234558, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "distill_loss": 0.09832875430583954, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "ref_ce_loss": 0.08714140206575394, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "loss": 0.4087405800819397, + "step": 6390 + }, + { + "ce_loss": 0.13376466929912567, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "distill_loss": 0.09935194253921509, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "ref_ce_loss": 0.12032655626535416, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "loss": 0.4319303631782532, + "step": 6390 + }, + { + "ce_loss": 0.20519354939460754, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "distill_loss": 0.13323552906513214, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "ref_ce_loss": 0.09306564927101135, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "loss": 0.540589451789856, + "step": 6390 + }, + { + "ce_loss": 0.1891559511423111, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "distill_loss": 0.14564424753189087, + "epoch": 2.131420947298199, + "step": 6390 + }, + { + "epoch": 2.131420947298199, + "ref_ce_loss": 0.16008201241493225, + "step": 6390 + }, + { + "epoch": 2.134756504336224, + "loss": 0.5359, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "grad_norm": 2.3692429065704346, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "learning_rate": 0.0002579959832278007, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "loss": 0.8921303749084473, + "step": 6400 + }, + { + "ce_loss": 0.13139651715755463, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "distill_loss": 0.11276914924383163, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "ref_ce_loss": 0.18120615184307098, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "loss": 0.4045407772064209, + "step": 6400 + }, + { + "ce_loss": 0.19195307791233063, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "distill_loss": 0.10690625756978989, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "ref_ce_loss": 0.10556533932685852, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "loss": 0.46520286798477173, + "step": 6400 + }, + { + "ce_loss": 0.23439905047416687, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "distill_loss": 0.10840963572263718, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "ref_ce_loss": 0.08725638687610626, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "loss": 0.7589808106422424, + "step": 6400 + }, + { + "ce_loss": 0.16259820759296417, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "distill_loss": 0.1078663095831871, + "epoch": 2.134756504336224, + "step": 6400 + }, + { + "epoch": 2.134756504336224, + "ref_ce_loss": 0.1864003986120224, + "step": 6400 + }, + { + "epoch": 2.1380920613742496, + "loss": 0.5514, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "grad_norm": 4.490102767944336, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "learning_rate": 0.0002578553067972205, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "loss": 0.46829667687416077, + "step": 6410 + }, + { + "ce_loss": 0.1818871796131134, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "distill_loss": 0.11202502250671387, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "ref_ce_loss": 0.12546919286251068, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "loss": 0.48561179637908936, + "step": 6410 + }, + { + "ce_loss": 0.17383486032485962, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "distill_loss": 0.11871214956045151, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "ref_ce_loss": 0.12499058246612549, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "loss": 0.4042794108390808, + "step": 6410 + }, + { + "ce_loss": 0.13534142076969147, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "distill_loss": 0.11494524031877518, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "ref_ce_loss": 0.11009050160646439, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "loss": 0.4688982367515564, + "step": 6410 + }, + { + "ce_loss": 0.09592331945896149, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "distill_loss": 0.09164990484714508, + "epoch": 2.1380920613742496, + "step": 6410 + }, + { + "epoch": 2.1380920613742496, + "ref_ce_loss": 0.11540322005748749, + "step": 6410 + }, + { + "epoch": 2.141427618412275, + "loss": 0.5349, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "grad_norm": 6.115320682525635, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "learning_rate": 0.00025771443368110625, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "loss": 0.7359644174575806, + "step": 6420 + }, + { + "ce_loss": 0.24195481836795807, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "distill_loss": 0.11840562522411346, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "ref_ce_loss": 0.19802844524383545, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "loss": 0.4115001857280731, + "step": 6420 + }, + { + "ce_loss": 0.17016026377677917, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "distill_loss": 0.1165749728679657, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "ref_ce_loss": 0.09531422704458237, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "loss": 0.5578610301017761, + "step": 6420 + }, + { + "ce_loss": 0.20728155970573425, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "distill_loss": 0.15174588561058044, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "ref_ce_loss": 0.09736379981040955, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "loss": 0.43680068850517273, + "step": 6420 + }, + { + "ce_loss": 0.17766030132770538, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "distill_loss": 0.14595100283622742, + "epoch": 2.141427618412275, + "step": 6420 + }, + { + "epoch": 2.141427618412275, + "ref_ce_loss": 0.11311586946249008, + "step": 6420 + }, + { + "epoch": 2.1447631754503003, + "loss": 0.5044, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "grad_norm": 3.4700722694396973, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "learning_rate": 0.0002575733641363548, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "loss": 0.40244314074516296, + "step": 6430 + }, + { + "ce_loss": 0.12530314922332764, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "distill_loss": 0.09940031170845032, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "ref_ce_loss": 0.11356396228075027, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "loss": 0.5394932627677917, + "step": 6430 + }, + { + "ce_loss": 0.24598979949951172, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "distill_loss": 0.11309697479009628, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "ref_ce_loss": 0.1273350715637207, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "loss": 0.5167526602745056, + "step": 6430 + }, + { + "ce_loss": 0.16347810626029968, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "distill_loss": 0.10503201186656952, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "ref_ce_loss": 0.1429758071899414, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "loss": 0.6103532314300537, + "step": 6430 + }, + { + "ce_loss": 0.19520029425621033, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "distill_loss": 0.11522606760263443, + "epoch": 2.1447631754503003, + "step": 6430 + }, + { + "epoch": 2.1447631754503003, + "ref_ce_loss": 0.21002265810966492, + "step": 6430 + }, + { + "epoch": 2.1480987324883256, + "loss": 0.5028, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "grad_norm": 2.7797205448150635, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "learning_rate": 0.0002574320984202214, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "loss": 0.352073073387146, + "step": 6440 + }, + { + "ce_loss": 0.14698846638202667, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "distill_loss": 0.08582793176174164, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "ref_ce_loss": 0.11884086579084396, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "loss": 0.5487686991691589, + "step": 6440 + }, + { + "ce_loss": 0.16990971565246582, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "distill_loss": 0.107993483543396, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "ref_ce_loss": 0.16219596564769745, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "loss": 0.30984169244766235, + "step": 6440 + }, + { + "ce_loss": 0.0969039648771286, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "distill_loss": 0.07845834642648697, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "ref_ce_loss": 0.13372232019901276, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "loss": 0.3871748745441437, + "step": 6440 + }, + { + "ce_loss": 0.13498345017433167, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "distill_loss": 0.10390393435955048, + "epoch": 2.1480987324883256, + "step": 6440 + }, + { + "epoch": 2.1480987324883256, + "ref_ce_loss": 0.147451713681221, + "step": 6440 + }, + { + "epoch": 2.151434289526351, + "loss": 0.5577, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "grad_norm": 3.586613416671753, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "learning_rate": 0.00025729063679031896, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "loss": 0.8539671897888184, + "step": 6450 + }, + { + "ce_loss": 0.13899965584278107, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "distill_loss": 0.12739282846450806, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "ref_ce_loss": 0.13170544803142548, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "loss": 0.43330442905426025, + "step": 6450 + }, + { + "ce_loss": 0.17675459384918213, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "distill_loss": 0.1473657488822937, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "ref_ce_loss": 0.10867439955472946, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "loss": 0.34321853518486023, + "step": 6450 + }, + { + "ce_loss": 0.10612574219703674, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "distill_loss": 0.1542721539735794, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "ref_ce_loss": 0.07491093873977661, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "loss": 0.876250147819519, + "step": 6450 + }, + { + "ce_loss": 0.18352952599525452, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "distill_loss": 0.1817750781774521, + "epoch": 2.151434289526351, + "step": 6450 + }, + { + "epoch": 2.151434289526351, + "ref_ce_loss": 0.11178678274154663, + "step": 6450 + }, + { + "epoch": 2.1547698465643763, + "loss": 0.5881, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "grad_norm": 4.06648063659668, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "learning_rate": 0.0002571489795046177, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "loss": 0.8367530703544617, + "step": 6460 + }, + { + "ce_loss": 0.19520042836666107, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "distill_loss": 0.16486920416355133, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "ref_ce_loss": 0.09433402866125107, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "loss": 0.36075928807258606, + "step": 6460 + }, + { + "ce_loss": 0.11127705872058868, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "distill_loss": 0.12301945686340332, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "ref_ce_loss": 0.12597696483135223, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "loss": 0.5136455297470093, + "step": 6460 + }, + { + "ce_loss": 0.19618333876132965, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "distill_loss": 0.12617187201976776, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "ref_ce_loss": 0.1307118833065033, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "loss": 0.6558589935302734, + "step": 6460 + }, + { + "ce_loss": 0.13309955596923828, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "distill_loss": 0.12923918664455414, + "epoch": 2.1547698465643763, + "step": 6460 + }, + { + "epoch": 2.1547698465643763, + "ref_ce_loss": 0.10269004851579666, + "step": 6460 + }, + { + "epoch": 2.1581054036024017, + "loss": 0.5978, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "grad_norm": 2.748814344406128, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "learning_rate": 0.0002570071268214447, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "loss": 0.5000072717666626, + "step": 6470 + }, + { + "ce_loss": 0.19368258118629456, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "distill_loss": 0.13927975296974182, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "ref_ce_loss": 0.1153336688876152, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "loss": 0.41083213686943054, + "step": 6470 + }, + { + "ce_loss": 0.15378105640411377, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "distill_loss": 0.1380583792924881, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "ref_ce_loss": 0.07984202355146408, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "loss": 0.7085814476013184, + "step": 6470 + }, + { + "ce_loss": 0.3445759415626526, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "distill_loss": 0.20446883141994476, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "ref_ce_loss": 0.15822933614253998, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "loss": 0.5256810188293457, + "step": 6470 + }, + { + "ce_loss": 0.19576559960842133, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "distill_loss": 0.12762930989265442, + "epoch": 2.1581054036024017, + "step": 6470 + }, + { + "epoch": 2.1581054036024017, + "ref_ce_loss": 0.13251972198486328, + "step": 6470 + }, + { + "epoch": 2.161440960640427, + "loss": 0.5677, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "grad_norm": 4.109629154205322, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "learning_rate": 0.0002568650789994832, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "loss": 0.45278507471084595, + "step": 6480 + }, + { + "ce_loss": 0.13802358508110046, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "distill_loss": 0.15413391590118408, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "ref_ce_loss": 0.10151858627796173, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "loss": 0.789211094379425, + "step": 6480 + }, + { + "ce_loss": 0.24831239879131317, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "distill_loss": 0.1875370591878891, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "ref_ce_loss": 0.2326797991991043, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "loss": 0.4289882779121399, + "step": 6480 + }, + { + "ce_loss": 0.11800291389226913, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "distill_loss": 0.1267249584197998, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "ref_ce_loss": 0.12218296527862549, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "loss": 0.3204435408115387, + "step": 6480 + }, + { + "ce_loss": 0.10140528529882431, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "distill_loss": 0.1469344198703766, + "epoch": 2.161440960640427, + "step": 6480 + }, + { + "epoch": 2.161440960640427, + "ref_ce_loss": 0.07105684280395508, + "step": 6480 + }, + { + "epoch": 2.1647765176784524, + "loss": 0.5624, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "grad_norm": 2.237226724624634, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "learning_rate": 0.0002567228362977725, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "loss": 0.5324109792709351, + "step": 6490 + }, + { + "ce_loss": 0.17774225771427155, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "distill_loss": 0.1355997622013092, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "ref_ce_loss": 0.13850535452365875, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "loss": 0.3751431703567505, + "step": 6490 + }, + { + "ce_loss": 0.1006331816315651, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "distill_loss": 0.12125347554683685, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "ref_ce_loss": 0.08901651203632355, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "loss": 0.4847124218940735, + "step": 6490 + }, + { + "ce_loss": 0.1677042692899704, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "distill_loss": 0.1740044504404068, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "ref_ce_loss": 0.14263245463371277, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "loss": 0.8070560097694397, + "step": 6490 + }, + { + "ce_loss": 0.3739077150821686, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "distill_loss": 0.17787256836891174, + "epoch": 2.1647765176784524, + "step": 6490 + }, + { + "epoch": 2.1647765176784524, + "ref_ce_loss": 0.1976572871208191, + "step": 6490 + }, + { + "epoch": 2.1681120747164777, + "loss": 0.5638, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "grad_norm": 3.8014607429504395, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "learning_rate": 0.00025658039897570703, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "loss": 0.4397726356983185, + "step": 6500 + }, + { + "ce_loss": 0.1840061992406845, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "distill_loss": 0.12719230353832245, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "ref_ce_loss": 0.09979324042797089, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "loss": 0.6797133684158325, + "step": 6500 + }, + { + "ce_loss": 0.14498358964920044, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "distill_loss": 0.14739961922168732, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "ref_ce_loss": 0.1322130262851715, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "loss": 0.8477792739868164, + "step": 6500 + }, + { + "ce_loss": 0.18177065253257751, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "distill_loss": 0.14841307699680328, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "ref_ce_loss": 0.15319450199604034, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "loss": 0.4149465560913086, + "step": 6500 + }, + { + "ce_loss": 0.13384394347667694, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "distill_loss": 0.12086221575737, + "epoch": 2.1681120747164777, + "step": 6500 + }, + { + "epoch": 2.1681120747164777, + "ref_ce_loss": 0.11601506918668747, + "step": 6500 + }, + { + "epoch": 2.171447631754503, + "loss": 0.5666, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "grad_norm": 2.200901508331299, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "learning_rate": 0.0002564377672930364, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "loss": 0.3441053330898285, + "step": 6510 + }, + { + "ce_loss": 0.0955720841884613, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "distill_loss": 0.0954127162694931, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "ref_ce_loss": 0.0820605456829071, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "loss": 0.3989987373352051, + "step": 6510 + }, + { + "ce_loss": 0.16622278094291687, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "distill_loss": 0.14429330825805664, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "ref_ce_loss": 0.08820192515850067, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "loss": 0.594819962978363, + "step": 6510 + }, + { + "ce_loss": 0.2731800377368927, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "distill_loss": 0.14734132587909698, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "ref_ce_loss": 0.1282769739627838, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "loss": 1.5799360275268555, + "step": 6510 + }, + { + "ce_loss": 0.2743644118309021, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "distill_loss": 0.15681089460849762, + "epoch": 2.171447631754503, + "step": 6510 + }, + { + "epoch": 2.171447631754503, + "ref_ce_loss": 0.114822618663311, + "step": 6510 + }, + { + "epoch": 2.1747831887925284, + "loss": 0.6169, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "grad_norm": 3.009552001953125, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "learning_rate": 0.00025629494150986455, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "loss": 0.5791804790496826, + "step": 6520 + }, + { + "ce_loss": 0.1254793405532837, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "distill_loss": 0.13040056824684143, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "ref_ce_loss": 0.1082243099808693, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "loss": 0.34121212363243103, + "step": 6520 + }, + { + "ce_loss": 0.08749425411224365, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "distill_loss": 0.13595782220363617, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "ref_ce_loss": 0.07215691357851028, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "loss": 0.35587042570114136, + "step": 6520 + }, + { + "ce_loss": 0.1334792971611023, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "distill_loss": 0.11082914471626282, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "ref_ce_loss": 0.08055583387613297, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "loss": 0.40503957867622375, + "step": 6520 + }, + { + "ce_loss": 0.1451195925474167, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "distill_loss": 0.13516882061958313, + "epoch": 2.1747831887925284, + "step": 6520 + }, + { + "epoch": 2.1747831887925284, + "ref_ce_loss": 0.08957435935735703, + "step": 6520 + }, + { + "epoch": 2.1781187458305538, + "loss": 0.528, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "grad_norm": 4.192197799682617, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "learning_rate": 0.00025615192188664925, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "loss": 0.45684969425201416, + "step": 6530 + }, + { + "ce_loss": 0.17055489122867584, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "distill_loss": 0.1565455198287964, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "ref_ce_loss": 0.05911305919289589, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "loss": 0.32195115089416504, + "step": 6530 + }, + { + "ce_loss": 0.06535065174102783, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "distill_loss": 0.11212996393442154, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "ref_ce_loss": 0.09981215000152588, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "loss": 0.5088362693786621, + "step": 6530 + }, + { + "ce_loss": 0.20849275588989258, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "distill_loss": 0.16551473736763, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "ref_ce_loss": 0.13474003970623016, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "loss": 0.7871987819671631, + "step": 6530 + }, + { + "ce_loss": 0.17473362386226654, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "distill_loss": 0.1323826164007187, + "epoch": 2.1781187458305538, + "step": 6530 + }, + { + "epoch": 2.1781187458305538, + "ref_ce_loss": 0.13147905468940735, + "step": 6530 + }, + { + "epoch": 2.181454302868579, + "loss": 0.5635, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "grad_norm": 2.9585373401641846, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "learning_rate": 0.000256008708684202, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "loss": 0.5356453657150269, + "step": 6540 + }, + { + "ce_loss": 0.187409907579422, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "distill_loss": 0.14706161618232727, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "ref_ce_loss": 0.13905051350593567, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "loss": 0.4628433585166931, + "step": 6540 + }, + { + "ce_loss": 0.17631632089614868, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "distill_loss": 0.08722099661827087, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "ref_ce_loss": 0.14837141335010529, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "loss": 0.4328906834125519, + "step": 6540 + }, + { + "ce_loss": 0.1406063288450241, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "distill_loss": 0.10556124150753021, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "ref_ce_loss": 0.08157248049974442, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "loss": 0.4424014091491699, + "step": 6540 + }, + { + "ce_loss": 0.11750722676515579, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "distill_loss": 0.12028899788856506, + "epoch": 2.181454302868579, + "step": 6540 + }, + { + "epoch": 2.181454302868579, + "ref_ce_loss": 0.09090970456600189, + "step": 6540 + }, + { + "epoch": 2.1847898599066045, + "loss": 0.531, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "grad_norm": 2.208059310913086, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "learning_rate": 0.00025586530216368706, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "loss": 0.49402064085006714, + "step": 6550 + }, + { + "ce_loss": 0.18181763589382172, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "distill_loss": 0.10426479578018188, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "ref_ce_loss": 0.1457383632659912, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "loss": 0.6502156257629395, + "step": 6550 + }, + { + "ce_loss": 0.1841825395822525, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "distill_loss": 0.1110747829079628, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "ref_ce_loss": 0.09839002788066864, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "loss": 0.5076692700386047, + "step": 6550 + }, + { + "ce_loss": 0.1509997844696045, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "distill_loss": 0.1115167960524559, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "ref_ce_loss": 0.14280161261558533, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "loss": 0.4125179052352905, + "step": 6550 + }, + { + "ce_loss": 0.14968010783195496, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "distill_loss": 0.08732464909553528, + "epoch": 2.1847898599066045, + "step": 6550 + }, + { + "epoch": 2.1847898599066045, + "ref_ce_loss": 0.11727482080459595, + "step": 6550 + }, + { + "epoch": 2.18812541694463, + "loss": 0.5558, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "grad_norm": 2.251790761947632, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "learning_rate": 0.00025572170258662146, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "loss": 0.628159761428833, + "step": 6560 + }, + { + "ce_loss": 0.22396205365657806, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "distill_loss": 0.15010559558868408, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "ref_ce_loss": 0.1103786751627922, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "loss": 0.673648476600647, + "step": 6560 + }, + { + "ce_loss": 0.1863107532262802, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "distill_loss": 0.15502431988716125, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "ref_ce_loss": 0.16920864582061768, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "loss": 0.6017754077911377, + "step": 6560 + }, + { + "ce_loss": 0.19705528020858765, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "distill_loss": 0.15663552284240723, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "ref_ce_loss": 0.1611385941505432, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "loss": 0.5017340183258057, + "step": 6560 + }, + { + "ce_loss": 0.20633363723754883, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "distill_loss": 0.13032835721969604, + "epoch": 2.18812541694463, + "step": 6560 + }, + { + "epoch": 2.18812541694463, + "ref_ce_loss": 0.12185481935739517, + "step": 6560 + }, + { + "epoch": 2.191460973982655, + "loss": 0.6456, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "grad_norm": 5.231355667114258, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "learning_rate": 0.00025557791021487417, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "loss": 0.6945635080337524, + "step": 6570 + }, + { + "ce_loss": 0.13277281820774078, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "distill_loss": 0.12004784494638443, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "ref_ce_loss": 0.13293145596981049, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "loss": 0.42795026302337646, + "step": 6570 + }, + { + "ce_loss": 0.11150016635656357, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "distill_loss": 0.14841175079345703, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "ref_ce_loss": 0.12076891213655472, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "loss": 0.5713083744049072, + "step": 6570 + }, + { + "ce_loss": 0.1848202347755432, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "distill_loss": 0.17503327131271362, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "ref_ce_loss": 0.13746482133865356, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "loss": 0.4127688407897949, + "step": 6570 + }, + { + "ce_loss": 0.09293724596500397, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "distill_loss": 0.09941837191581726, + "epoch": 2.191460973982655, + "step": 6570 + }, + { + "epoch": 2.191460973982655, + "ref_ce_loss": 0.08287370204925537, + "step": 6570 + }, + { + "epoch": 2.1947965310206805, + "loss": 0.575, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "grad_norm": 2.4874107837677, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "learning_rate": 0.0002554339253106657, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "loss": 0.46032392978668213, + "step": 6580 + }, + { + "ce_loss": 0.09199360013008118, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "distill_loss": 0.1110043004155159, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "ref_ce_loss": 0.12053350359201431, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "loss": 0.2444971650838852, + "step": 6580 + }, + { + "ce_loss": 0.07341790199279785, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "distill_loss": 0.10386674851179123, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "ref_ce_loss": 0.06677841395139694, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "loss": 0.6744773387908936, + "step": 6580 + }, + { + "ce_loss": 0.19307246804237366, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "distill_loss": 0.18344935774803162, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "ref_ce_loss": 0.1400938779115677, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "loss": 0.3913784325122833, + "step": 6580 + }, + { + "ce_loss": 0.08127550780773163, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "distill_loss": 0.10964338481426239, + "epoch": 2.1947965310206805, + "step": 6580 + }, + { + "epoch": 2.1947965310206805, + "ref_ce_loss": 0.12807399034500122, + "step": 6580 + }, + { + "epoch": 2.198132088058706, + "loss": 0.5882, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "grad_norm": 2.5270981788635254, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "learning_rate": 0.00025528974813656785, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "loss": 0.4461905062198639, + "step": 6590 + }, + { + "ce_loss": 0.18007469177246094, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "distill_loss": 0.0942193865776062, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "ref_ce_loss": 0.17168787121772766, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "loss": 0.45658642053604126, + "step": 6590 + }, + { + "ce_loss": 0.16922041773796082, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "distill_loss": 0.1462121605873108, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "ref_ce_loss": 0.14055679738521576, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "loss": 0.6269693374633789, + "step": 6590 + }, + { + "ce_loss": 0.14432154595851898, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "distill_loss": 0.11544007807970047, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "ref_ce_loss": 0.09846153855323792, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "loss": 0.41132834553718567, + "step": 6590 + }, + { + "ce_loss": 0.13266809284687042, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "distill_loss": 0.11859145760536194, + "epoch": 2.198132088058706, + "step": 6590 + }, + { + "epoch": 2.198132088058706, + "ref_ce_loss": 0.1222081333398819, + "step": 6590 + }, + { + "epoch": 2.201467645096731, + "loss": 0.5259, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "grad_norm": 2.9365334510803223, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "learning_rate": 0.00025514537895550274, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "loss": 0.26942503452301025, + "step": 6600 + }, + { + "ce_loss": 0.0721149668097496, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "distill_loss": 0.09688413888216019, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "ref_ce_loss": 0.1003360003232956, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "loss": 0.39754799008369446, + "step": 6600 + }, + { + "ce_loss": 0.1665194183588028, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "distill_loss": 0.1093418300151825, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "ref_ce_loss": 0.09641645103693008, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "loss": 1.559045433998108, + "step": 6600 + }, + { + "ce_loss": 0.22894124686717987, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "distill_loss": 0.16813145577907562, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "ref_ce_loss": 0.11116345226764679, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "loss": 0.2761334478855133, + "step": 6600 + }, + { + "ce_loss": 0.06958366185426712, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "distill_loss": 0.10546497255563736, + "epoch": 2.201467645096731, + "step": 6600 + }, + { + "epoch": 2.201467645096731, + "ref_ce_loss": 0.1002037301659584, + "step": 6600 + }, + { + "epoch": 2.2048032021347566, + "loss": 0.5541, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "grad_norm": 2.531602621078491, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "learning_rate": 0.0002550008180307429, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "loss": 0.7316204309463501, + "step": 6610 + }, + { + "ce_loss": 0.18090717494487762, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "distill_loss": 0.10258594155311584, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "ref_ce_loss": 0.12114837020635605, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "loss": 0.5733076930046082, + "step": 6610 + }, + { + "ce_loss": 0.26956093311309814, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "distill_loss": 0.10860621929168701, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "ref_ce_loss": 0.13310791552066803, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "loss": 0.40223658084869385, + "step": 6610 + }, + { + "ce_loss": 0.16505920886993408, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "distill_loss": 0.10595328360795975, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "ref_ce_loss": 0.09320106357336044, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "loss": 0.36049261689186096, + "step": 6610 + }, + { + "ce_loss": 0.11396670341491699, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "distill_loss": 0.10318335890769958, + "epoch": 2.2048032021347566, + "step": 6610 + }, + { + "epoch": 2.2048032021347566, + "ref_ce_loss": 0.09993364661931992, + "step": 6610 + }, + { + "epoch": 2.208138759172782, + "loss": 0.512, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "grad_norm": 4.26342248916626, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "learning_rate": 0.0002548560656259104, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "loss": 0.47284626960754395, + "step": 6620 + }, + { + "ce_loss": 0.2442830353975296, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "distill_loss": 0.11992715299129486, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "ref_ce_loss": 0.07802657037973404, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "loss": 0.7858883142471313, + "step": 6620 + }, + { + "ce_loss": 0.15104246139526367, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "distill_loss": 0.09752034395933151, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "ref_ce_loss": 0.1495540291070938, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "loss": 0.6424992680549622, + "step": 6620 + }, + { + "ce_loss": 0.21502114832401276, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "distill_loss": 0.12338387966156006, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "ref_ce_loss": 0.14934112131595612, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "loss": 0.5512647032737732, + "step": 6620 + }, + { + "ce_loss": 0.2337639182806015, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "distill_loss": 0.11226874589920044, + "epoch": 2.208138759172782, + "step": 6620 + }, + { + "epoch": 2.208138759172782, + "ref_ce_loss": 0.1710253655910492, + "step": 6620 + }, + { + "epoch": 2.2114743162108073, + "loss": 0.5531, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "grad_norm": 1.902422308921814, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "learning_rate": 0.0002547111220049765, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "loss": 0.4596637785434723, + "step": 6630 + }, + { + "ce_loss": 0.18465951085090637, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "distill_loss": 0.12532518804073334, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "ref_ce_loss": 0.09735433012247086, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "loss": 0.6123343110084534, + "step": 6630 + }, + { + "ce_loss": 0.1756560355424881, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "distill_loss": 0.15115751326084137, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "ref_ce_loss": 0.10715075582265854, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "loss": 0.44199562072753906, + "step": 6630 + }, + { + "ce_loss": 0.16095171868801117, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "distill_loss": 0.07144710421562195, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "ref_ce_loss": 0.1259879767894745, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "loss": 0.3934045433998108, + "step": 6630 + }, + { + "ce_loss": 0.1257804036140442, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "distill_loss": 0.10377589613199234, + "epoch": 2.2114743162108073, + "step": 6630 + }, + { + "epoch": 2.2114743162108073, + "ref_ce_loss": 0.09237315505743027, + "step": 6630 + }, + { + "epoch": 2.2148098732488326, + "loss": 0.5225, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "grad_norm": 2.804863214492798, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "learning_rate": 0.00025456598743226134, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "loss": 0.5671036243438721, + "step": 6640 + }, + { + "ce_loss": 0.22472530603408813, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "distill_loss": 0.146651491522789, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "ref_ce_loss": 0.12498027831315994, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "loss": 0.51534503698349, + "step": 6640 + }, + { + "ce_loss": 0.21721240878105164, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "distill_loss": 0.14261776208877563, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "ref_ce_loss": 0.1554638296365738, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "loss": 0.625286877155304, + "step": 6640 + }, + { + "ce_loss": 0.17786265909671783, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "distill_loss": 0.16213417053222656, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "ref_ce_loss": 0.1285240203142166, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "loss": 0.7410913109779358, + "step": 6640 + }, + { + "ce_loss": 0.14669311046600342, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "distill_loss": 0.12467852979898453, + "epoch": 2.2148098732488326, + "step": 6640 + }, + { + "epoch": 2.2148098732488326, + "ref_ce_loss": 0.12662538886070251, + "step": 6640 + }, + { + "epoch": 2.218145430286858, + "loss": 0.5819, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "grad_norm": 2.284592390060425, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "learning_rate": 0.0002544206621724329, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "loss": 0.7698065042495728, + "step": 6650 + }, + { + "ce_loss": 0.36121097207069397, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "distill_loss": 0.19637200236320496, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "ref_ce_loss": 0.15686072409152985, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "loss": 0.2823468744754791, + "step": 6650 + }, + { + "ce_loss": 0.08524331450462341, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "distill_loss": 0.1248115599155426, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "ref_ce_loss": 0.0721224993467331, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "loss": 0.8946218490600586, + "step": 6650 + }, + { + "ce_loss": 0.17718909680843353, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "distill_loss": 0.14615222811698914, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "ref_ce_loss": 0.12515729665756226, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "loss": 0.40610453486442566, + "step": 6650 + }, + { + "ce_loss": 0.1060795858502388, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "distill_loss": 0.17362916469573975, + "epoch": 2.218145430286858, + "step": 6650 + }, + { + "epoch": 2.218145430286858, + "ref_ce_loss": 0.08574973046779633, + "step": 6650 + }, + { + "epoch": 2.2214809873248833, + "loss": 0.5958, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "grad_norm": 2.7552988529205322, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "learning_rate": 0.0002542751464905073, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "loss": 0.6930282115936279, + "step": 6660 + }, + { + "ce_loss": 0.27699872851371765, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "distill_loss": 0.18802866339683533, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "ref_ce_loss": 0.13296718895435333, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "loss": 0.5645378828048706, + "step": 6660 + }, + { + "ce_loss": 0.15500415861606598, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "distill_loss": 0.1263539344072342, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "ref_ce_loss": 0.13144756853580475, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "loss": 0.4319797456264496, + "step": 6660 + }, + { + "ce_loss": 0.15598873794078827, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "distill_loss": 0.1293891817331314, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "ref_ce_loss": 0.14640480279922485, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "loss": 0.3450653553009033, + "step": 6660 + }, + { + "ce_loss": 0.086029551923275, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "distill_loss": 0.1262974590063095, + "epoch": 2.2214809873248833, + "step": 6660 + }, + { + "epoch": 2.2214809873248833, + "ref_ce_loss": 0.11724057048559189, + "step": 6660 + }, + { + "epoch": 2.2248165443629087, + "loss": 0.578, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "grad_norm": 2.0899698734283447, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "learning_rate": 0.0002541294406518477, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "loss": 0.519721508026123, + "step": 6670 + }, + { + "ce_loss": 0.20661495625972748, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "distill_loss": 0.11647433042526245, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "ref_ce_loss": 0.14371584355831146, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "loss": 0.41748136281967163, + "step": 6670 + }, + { + "ce_loss": 0.0835602805018425, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "distill_loss": 0.11514532566070557, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "ref_ce_loss": 0.08703186362981796, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "loss": 0.7007561922073364, + "step": 6670 + }, + { + "ce_loss": 0.17401117086410522, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "distill_loss": 0.16014891862869263, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "ref_ce_loss": 0.11379396170377731, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "loss": 0.8025463819503784, + "step": 6670 + }, + { + "ce_loss": 0.23345544934272766, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "distill_loss": 0.15384605526924133, + "epoch": 2.2248165443629087, + "step": 6670 + }, + { + "epoch": 2.2248165443629087, + "ref_ce_loss": 0.1933191865682602, + "step": 6670 + }, + { + "epoch": 2.228152101400934, + "loss": 0.5479, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "grad_norm": 2.624807357788086, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "learning_rate": 0.0002539835449221641, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "loss": 0.8735204339027405, + "step": 6680 + }, + { + "ce_loss": 0.17503662407398224, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "distill_loss": 0.12262516468763351, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "ref_ce_loss": 0.15919841825962067, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "loss": 0.5863019227981567, + "step": 6680 + }, + { + "ce_loss": 0.12611548602581024, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "distill_loss": 0.10386236757040024, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "ref_ce_loss": 0.1623413860797882, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "loss": 0.8576542139053345, + "step": 6680 + }, + { + "ce_loss": 0.15461662411689758, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "distill_loss": 0.13622869551181793, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "ref_ce_loss": 0.09053165465593338, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "loss": 0.3839050531387329, + "step": 6680 + }, + { + "ce_loss": 0.17515422403812408, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "distill_loss": 0.12041886150836945, + "epoch": 2.228152101400934, + "step": 6680 + }, + { + "epoch": 2.228152101400934, + "ref_ce_loss": 0.0845176950097084, + "step": 6680 + }, + { + "epoch": 2.2314876584389594, + "loss": 0.5373, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "grad_norm": 3.100541591644287, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "learning_rate": 0.0002538374595675126, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "loss": 0.34606313705444336, + "step": 6690 + }, + { + "ce_loss": 0.09983520209789276, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "distill_loss": 0.09398765861988068, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "ref_ce_loss": 0.10421671718358994, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "loss": 0.46764615178108215, + "step": 6690 + }, + { + "ce_loss": 0.10423019528388977, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "distill_loss": 0.11818268150091171, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "ref_ce_loss": 0.12623147666454315, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "loss": 0.44064533710479736, + "step": 6690 + }, + { + "ce_loss": 0.13126851618289948, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "distill_loss": 0.10611870884895325, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "ref_ce_loss": 0.13785810768604279, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "loss": 0.6716941595077515, + "step": 6690 + }, + { + "ce_loss": 0.1662738025188446, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "distill_loss": 0.08780381828546524, + "epoch": 2.2314876584389594, + "step": 6690 + }, + { + "epoch": 2.2314876584389594, + "ref_ce_loss": 0.12413392215967178, + "step": 6690 + }, + { + "epoch": 2.2348232154769847, + "loss": 0.5717, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "grad_norm": 2.7917656898498535, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "learning_rate": 0.00025369118485429545, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "loss": 0.6208775639533997, + "step": 6700 + }, + { + "ce_loss": 0.3293250501155853, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "distill_loss": 0.1531747728586197, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "ref_ce_loss": 0.13826331496238708, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "loss": 0.8903650045394897, + "step": 6700 + }, + { + "ce_loss": 0.2016555815935135, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "distill_loss": 0.10579682141542435, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "ref_ce_loss": 0.14238031208515167, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "loss": 0.45686089992523193, + "step": 6700 + }, + { + "ce_loss": 0.197709321975708, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "distill_loss": 0.11980358511209488, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "ref_ce_loss": 0.13925841450691223, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "loss": 0.4481344521045685, + "step": 6700 + }, + { + "ce_loss": 0.2102845311164856, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "distill_loss": 0.11122799664735794, + "epoch": 2.2348232154769847, + "step": 6700 + }, + { + "epoch": 2.2348232154769847, + "ref_ce_loss": 0.1259424388408661, + "step": 6700 + }, + { + "epoch": 2.23815877251501, + "loss": 0.518, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "grad_norm": 2.218740463256836, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "learning_rate": 0.00025354472104926, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "loss": 0.3201616108417511, + "step": 6710 + }, + { + "ce_loss": 0.0903606191277504, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "distill_loss": 0.098169706761837, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "ref_ce_loss": 0.09702183306217194, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "loss": 0.43331289291381836, + "step": 6710 + }, + { + "ce_loss": 0.14315032958984375, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "distill_loss": 0.11144326627254486, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "ref_ce_loss": 0.1140560731291771, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "loss": 0.6270884871482849, + "step": 6710 + }, + { + "ce_loss": 0.16704252362251282, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "distill_loss": 0.1067693904042244, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "ref_ce_loss": 0.101380854845047, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "loss": 0.5825164318084717, + "step": 6710 + }, + { + "ce_loss": 0.1660555601119995, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "distill_loss": 0.11673898249864578, + "epoch": 2.23815877251501, + "step": 6710 + }, + { + "epoch": 2.23815877251501, + "ref_ce_loss": 0.10022629052400589, + "step": 6710 + }, + { + "epoch": 2.2414943295530354, + "loss": 0.5457, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "grad_norm": 3.044771909713745, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "learning_rate": 0.00025339806841949837, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "loss": 1.189454197883606, + "step": 6720 + }, + { + "ce_loss": 0.1252426952123642, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "distill_loss": 0.0986853614449501, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "ref_ce_loss": 0.09285452961921692, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "loss": 0.7065540552139282, + "step": 6720 + }, + { + "ce_loss": 0.16824042797088623, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "distill_loss": 0.12243731319904327, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "ref_ce_loss": 0.11678522825241089, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "loss": 0.6083236932754517, + "step": 6720 + }, + { + "ce_loss": 0.1929941177368164, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "distill_loss": 0.09316430985927582, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "ref_ce_loss": 0.1361846625804901, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "loss": 0.4545796811580658, + "step": 6720 + }, + { + "ce_loss": 0.18878419697284698, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "distill_loss": 0.14338716864585876, + "epoch": 2.2414943295530354, + "step": 6720 + }, + { + "epoch": 2.2414943295530354, + "ref_ce_loss": 0.09671805799007416, + "step": 6720 + }, + { + "epoch": 2.2448298865910608, + "loss": 0.6404, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "grad_norm": 3.4351155757904053, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "learning_rate": 0.0002532512272324472, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "loss": 0.4270901381969452, + "step": 6730 + }, + { + "ce_loss": 0.08665609359741211, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "distill_loss": 0.15767119824886322, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "ref_ce_loss": 0.12011439353227615, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "loss": 0.48001086711883545, + "step": 6730 + }, + { + "ce_loss": 0.13299185037612915, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "distill_loss": 0.14891919493675232, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "ref_ce_loss": 0.108582504093647, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "loss": 0.6431511640548706, + "step": 6730 + }, + { + "ce_loss": 0.14817194640636444, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "distill_loss": 0.13265648484230042, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "ref_ce_loss": 0.09085676819086075, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "loss": 0.6905701756477356, + "step": 6730 + }, + { + "ce_loss": 0.23921699821949005, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "distill_loss": 0.20255146920681, + "epoch": 2.2448298865910608, + "step": 6730 + }, + { + "epoch": 2.2448298865910608, + "ref_ce_loss": 0.19140289723873138, + "step": 6730 + }, + { + "epoch": 2.248165443629086, + "loss": 0.6584, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "grad_norm": 3.7747929096221924, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "learning_rate": 0.0002531041977558868, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "loss": 0.4672980308532715, + "step": 6740 + }, + { + "ce_loss": 0.15710696578025818, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "distill_loss": 0.1362805962562561, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "ref_ce_loss": 0.12380146235227585, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "loss": 0.8580319285392761, + "step": 6740 + }, + { + "ce_loss": 0.15320658683776855, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "distill_loss": 0.12126266956329346, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "ref_ce_loss": 0.09768229722976685, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "loss": 0.247745543718338, + "step": 6740 + }, + { + "ce_loss": 0.07998524606227875, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "distill_loss": 0.08720527589321136, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "ref_ce_loss": 0.049813829362392426, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "loss": 0.464148610830307, + "step": 6740 + }, + { + "ce_loss": 0.16646577417850494, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "distill_loss": 0.14713206887245178, + "epoch": 2.248165443629086, + "step": 6740 + }, + { + "epoch": 2.248165443629086, + "ref_ce_loss": 0.10102847218513489, + "step": 6740 + }, + { + "epoch": 2.2515010006671115, + "loss": 0.551, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "grad_norm": 2.241534948348999, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "learning_rate": 0.00025295698025794094, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "loss": 0.5242320895195007, + "step": 6750 + }, + { + "ce_loss": 0.17825943231582642, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "distill_loss": 0.14979057013988495, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "ref_ce_loss": 0.15575054287910461, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "loss": 0.4172029495239258, + "step": 6750 + }, + { + "ce_loss": 0.16601456701755524, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "distill_loss": 0.14670082926750183, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "ref_ce_loss": 0.1038890928030014, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "loss": 0.3701702356338501, + "step": 6750 + }, + { + "ce_loss": 0.1041710376739502, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "distill_loss": 0.10256893187761307, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "ref_ce_loss": 0.12215316295623779, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "loss": 0.2804224193096161, + "step": 6750 + }, + { + "ce_loss": 0.06712121516466141, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "distill_loss": 0.08440475165843964, + "epoch": 2.2515010006671115, + "step": 6750 + }, + { + "epoch": 2.2515010006671115, + "ref_ce_loss": 0.0904068648815155, + "step": 6750 + }, + { + "epoch": 2.254836557705137, + "loss": 0.537, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "grad_norm": 3.29146146774292, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "learning_rate": 0.0002528095750070764, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "loss": 0.30096712708473206, + "step": 6760 + }, + { + "ce_loss": 0.06180081143975258, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "distill_loss": 0.09566116333007812, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "ref_ce_loss": 0.09152912348508835, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "loss": 0.6212273836135864, + "step": 6760 + }, + { + "ce_loss": 0.3123440444469452, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "distill_loss": 0.12399928271770477, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "ref_ce_loss": 0.18479116261005402, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "loss": 0.45415574312210083, + "step": 6760 + }, + { + "ce_loss": 0.18255992233753204, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "distill_loss": 0.12544983625411987, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "ref_ce_loss": 0.08303073048591614, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "loss": 0.599034309387207, + "step": 6760 + }, + { + "ce_loss": 0.2385578453540802, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "distill_loss": 0.13109007477760315, + "epoch": 2.254836557705137, + "step": 6760 + }, + { + "epoch": 2.254836557705137, + "ref_ce_loss": 0.12280596792697906, + "step": 6760 + }, + { + "epoch": 2.258172114743162, + "loss": 0.5318, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "grad_norm": 3.4085757732391357, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "learning_rate": 0.00025266198227210203, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "loss": 0.31608596444129944, + "step": 6770 + }, + { + "ce_loss": 0.07763687521219254, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "distill_loss": 0.10791601240634918, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "ref_ce_loss": 0.07429070770740509, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "loss": 0.417531818151474, + "step": 6770 + }, + { + "ce_loss": 0.14198024570941925, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "distill_loss": 0.13423949480056763, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "ref_ce_loss": 0.14104554057121277, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "loss": 0.4688641428947449, + "step": 6770 + }, + { + "ce_loss": 0.21024879813194275, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "distill_loss": 0.1353413462638855, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "ref_ce_loss": 0.12314224988222122, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "loss": 0.4443889856338501, + "step": 6770 + }, + { + "ce_loss": 0.106757752597332, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "distill_loss": 0.10572335869073868, + "epoch": 2.258172114743162, + "step": 6770 + }, + { + "epoch": 2.258172114743162, + "ref_ce_loss": 0.110373854637146, + "step": 6770 + }, + { + "epoch": 2.2615076717811875, + "loss": 0.5922, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "grad_norm": 3.920358180999756, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "learning_rate": 0.0002525142023221689, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "loss": 0.2829582393169403, + "step": 6780 + }, + { + "ce_loss": 0.11151076853275299, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "distill_loss": 0.10000155121088028, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "ref_ce_loss": 0.07125628739595413, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "loss": 0.5526301264762878, + "step": 6780 + }, + { + "ce_loss": 0.17572642862796783, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "distill_loss": 0.15626123547554016, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "ref_ce_loss": 0.1614377349615097, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "loss": 0.5078950524330139, + "step": 6780 + }, + { + "ce_loss": 0.16883182525634766, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "distill_loss": 0.11712483316659927, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "ref_ce_loss": 0.1466582715511322, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "loss": 0.7580641508102417, + "step": 6780 + }, + { + "ce_loss": 0.10384313017129898, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "distill_loss": 0.10925843566656113, + "epoch": 2.2615076717811875, + "step": 6780 + }, + { + "epoch": 2.2615076717811875, + "ref_ce_loss": 0.08862993121147156, + "step": 6780 + }, + { + "epoch": 2.264843228819213, + "loss": 0.5518, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "grad_norm": 1.862863302230835, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "learning_rate": 0.0002523662354267693, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "loss": 0.2049292027950287, + "step": 6790 + }, + { + "ce_loss": 0.054800186306238174, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "distill_loss": 0.05896997079253197, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "ref_ce_loss": 0.055623188614845276, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "loss": 0.4574205279350281, + "step": 6790 + }, + { + "ce_loss": 0.16178199648857117, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "distill_loss": 0.0981992781162262, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "ref_ce_loss": 0.10306326299905777, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "loss": 0.3297522962093353, + "step": 6790 + }, + { + "ce_loss": 0.10595489293336868, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "distill_loss": 0.1075303927063942, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "ref_ce_loss": 0.0912047028541565, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "loss": 0.932381272315979, + "step": 6790 + }, + { + "ce_loss": 0.19181010127067566, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "distill_loss": 0.13706378638744354, + "epoch": 2.264843228819213, + "step": 6790 + }, + { + "epoch": 2.264843228819213, + "ref_ce_loss": 0.18505807220935822, + "step": 6790 + }, + { + "epoch": 2.268178785857238, + "loss": 0.5435, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "grad_norm": 3.101024627685547, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "learning_rate": 0.0002522180818557364, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "loss": 0.7316094040870667, + "step": 6800 + }, + { + "ce_loss": 0.12107143551111221, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "distill_loss": 0.09981100261211395, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "ref_ce_loss": 0.11426037549972534, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "loss": 0.4724943935871124, + "step": 6800 + }, + { + "ce_loss": 0.19530630111694336, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "distill_loss": 0.1164764016866684, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "ref_ce_loss": 0.12989644706249237, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "loss": 0.288299024105072, + "step": 6800 + }, + { + "ce_loss": 0.08834747225046158, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "distill_loss": 0.07737602293491364, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "ref_ce_loss": 0.12200861424207687, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "loss": 0.3896799087524414, + "step": 6800 + }, + { + "ce_loss": 0.14076721668243408, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "distill_loss": 0.09837865829467773, + "epoch": 2.268178785857238, + "step": 6800 + }, + { + "epoch": 2.268178785857238, + "ref_ce_loss": 0.10813792049884796, + "step": 6800 + }, + { + "epoch": 2.2715143428952635, + "loss": 0.5463, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "grad_norm": 3.9756486415863037, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "learning_rate": 0.00025206974187924397, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "loss": 0.7348902821540833, + "step": 6810 + }, + { + "ce_loss": 0.1007981151342392, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "distill_loss": 0.07493539154529572, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "ref_ce_loss": 0.1073109582066536, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "loss": 0.7470747232437134, + "step": 6810 + }, + { + "ce_loss": 0.2006990760564804, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "distill_loss": 0.0991901382803917, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "ref_ce_loss": 0.11322102695703506, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "loss": 0.49651744961738586, + "step": 6810 + }, + { + "ce_loss": 0.19121769070625305, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "distill_loss": 0.08532430976629257, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "ref_ce_loss": 0.1548120677471161, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "loss": 0.44100818037986755, + "step": 6810 + }, + { + "ce_loss": 0.20114682614803314, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "distill_loss": 0.13773350417613983, + "epoch": 2.2715143428952635, + "step": 6810 + }, + { + "epoch": 2.2715143428952635, + "ref_ce_loss": 0.0654730275273323, + "step": 6810 + }, + { + "epoch": 2.274849899933289, + "loss": 0.5656, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "grad_norm": 2.955838441848755, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "learning_rate": 0.0002519212157678056, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "loss": 0.5254935622215271, + "step": 6820 + }, + { + "ce_loss": 0.24336126446723938, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "distill_loss": 0.13984636962413788, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "ref_ce_loss": 0.10717921704053879, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "loss": 0.861416220664978, + "step": 6820 + }, + { + "ce_loss": 0.17388497292995453, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "distill_loss": 0.12055382877588272, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "ref_ce_loss": 0.1794699728488922, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "loss": 0.44711706042289734, + "step": 6820 + }, + { + "ce_loss": 0.21135549247264862, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "distill_loss": 0.1068238839507103, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "ref_ce_loss": 0.12878869473934174, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "loss": 0.9810287952423096, + "step": 6820 + }, + { + "ce_loss": 0.24978908896446228, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "distill_loss": 0.11645025759935379, + "epoch": 2.274849899933289, + "step": 6820 + }, + { + "epoch": 2.274849899933289, + "ref_ce_loss": 0.1504693478345871, + "step": 6820 + }, + { + "epoch": 2.2781854569713142, + "loss": 0.4938, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "grad_norm": 1.7476575374603271, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "learning_rate": 0.00025177250379227427, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "loss": 0.39564117789268494, + "step": 6830 + }, + { + "ce_loss": 0.16511234641075134, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "distill_loss": 0.1051425114274025, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "ref_ce_loss": 0.10434064269065857, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "loss": 0.4065183103084564, + "step": 6830 + }, + { + "ce_loss": 0.1354883462190628, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "distill_loss": 0.08841560781002045, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "ref_ce_loss": 0.10876308381557465, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "loss": 0.553986668586731, + "step": 6830 + }, + { + "ce_loss": 0.16752132773399353, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "distill_loss": 0.09990599751472473, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "ref_ce_loss": 0.190854012966156, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "loss": 0.6248155236244202, + "step": 6830 + }, + { + "ce_loss": 0.1778598129749298, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "distill_loss": 0.12430022656917572, + "epoch": 2.2781854569713142, + "step": 6830 + }, + { + "epoch": 2.2781854569713142, + "ref_ce_loss": 0.15459713339805603, + "step": 6830 + }, + { + "epoch": 2.2815210140093396, + "loss": 0.5051, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "grad_norm": 2.298872232437134, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "learning_rate": 0.00025162360622384204, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "loss": 0.9022257328033447, + "step": 6840 + }, + { + "ce_loss": 0.13605789840221405, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "distill_loss": 0.09870048612356186, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "ref_ce_loss": 0.1417873352766037, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "loss": 0.48904597759246826, + "step": 6840 + }, + { + "ce_loss": 0.24442356824874878, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "distill_loss": 0.10273656249046326, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "ref_ce_loss": 0.1416039913892746, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "loss": 0.4396345317363739, + "step": 6840 + }, + { + "ce_loss": 0.1511615812778473, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "distill_loss": 0.08964379131793976, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "ref_ce_loss": 0.11310078203678131, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "loss": 0.717617392539978, + "step": 6840 + }, + { + "ce_loss": 0.22391705214977264, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "distill_loss": 0.12639391422271729, + "epoch": 2.2815210140093396, + "step": 6840 + }, + { + "epoch": 2.2815210140093396, + "ref_ce_loss": 0.15380483865737915, + "step": 6840 + }, + { + "epoch": 2.284856571047365, + "loss": 0.5406, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "grad_norm": 4.39909553527832, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "learning_rate": 0.0002514745233340393, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "loss": 0.33096733689308167, + "step": 6850 + }, + { + "ce_loss": 0.06250643730163574, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "distill_loss": 0.09241592884063721, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "ref_ce_loss": 0.08213594555854797, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "loss": 0.4952237010002136, + "step": 6850 + }, + { + "ce_loss": 0.19607624411582947, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "distill_loss": 0.11687838286161423, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "ref_ce_loss": 0.13495883345603943, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "loss": 0.28248468041419983, + "step": 6850 + }, + { + "ce_loss": 0.07966279238462448, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "distill_loss": 0.08891388028860092, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "ref_ce_loss": 0.11379349231719971, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "loss": 0.6294571161270142, + "step": 6850 + }, + { + "ce_loss": 0.23358556628227234, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "distill_loss": 0.12204214185476303, + "epoch": 2.284856571047365, + "step": 6850 + }, + { + "epoch": 2.284856571047365, + "ref_ce_loss": 0.11346219480037689, + "step": 6850 + }, + { + "epoch": 2.2881921280853903, + "loss": 0.5312, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "grad_norm": 2.713907241821289, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "learning_rate": 0.0002513252553947344, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "loss": 1.2297799587249756, + "step": 6860 + }, + { + "ce_loss": 0.2428199201822281, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "distill_loss": 0.10671888291835785, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "ref_ce_loss": 0.0930713415145874, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "loss": 0.4076330065727234, + "step": 6860 + }, + { + "ce_loss": 0.1973707377910614, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "distill_loss": 0.09870386868715286, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "ref_ce_loss": 0.11129038035869598, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "loss": 0.49984055757522583, + "step": 6860 + }, + { + "ce_loss": 0.25557148456573486, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "distill_loss": 0.11275038868188858, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "ref_ce_loss": 0.1314382702112198, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "loss": 0.3817070722579956, + "step": 6860 + }, + { + "ce_loss": 0.10354137420654297, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "distill_loss": 0.09449654817581177, + "epoch": 2.2881921280853903, + "step": 6860 + }, + { + "epoch": 2.2881921280853903, + "ref_ce_loss": 0.1289808601140976, + "step": 6860 + }, + { + "epoch": 2.2915276851234156, + "loss": 0.613, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "grad_norm": 4.802642345428467, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "learning_rate": 0.00025117580267813324, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "loss": 0.40103641152381897, + "step": 6870 + }, + { + "ce_loss": 0.12699085474014282, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "distill_loss": 0.10826943814754486, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "ref_ce_loss": 0.11575514823198318, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "loss": 0.5134308934211731, + "step": 6870 + }, + { + "ce_loss": 0.17653977870941162, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "distill_loss": 0.10092929750680923, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "ref_ce_loss": 0.12954755127429962, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "loss": 0.49596887826919556, + "step": 6870 + }, + { + "ce_loss": 0.12782566249370575, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "distill_loss": 0.0934700220823288, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "ref_ce_loss": 0.11930210143327713, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "loss": 0.4235532283782959, + "step": 6870 + }, + { + "ce_loss": 0.1441873013973236, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "distill_loss": 0.13626454770565033, + "epoch": 2.2915276851234156, + "step": 6870 + }, + { + "epoch": 2.2915276851234156, + "ref_ce_loss": 0.09318730980157852, + "step": 6870 + }, + { + "epoch": 2.294863242161441, + "loss": 0.5244, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "grad_norm": 1.8829163312911987, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "learning_rate": 0.00025102616545677855, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "loss": 0.550043523311615, + "step": 6880 + }, + { + "ce_loss": 0.2702171802520752, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "distill_loss": 0.11289297789335251, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "ref_ce_loss": 0.1368609368801117, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "loss": 0.532550573348999, + "step": 6880 + }, + { + "ce_loss": 0.1481907218694687, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "distill_loss": 0.1028796136379242, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "ref_ce_loss": 0.12922383844852448, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "loss": 0.41644102334976196, + "step": 6880 + }, + { + "ce_loss": 0.17733660340309143, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "distill_loss": 0.09103229641914368, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "ref_ce_loss": 0.14790113270282745, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "loss": 0.3858429789543152, + "step": 6880 + }, + { + "ce_loss": 0.11549221724271774, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "distill_loss": 0.10008694231510162, + "epoch": 2.294863242161441, + "step": 6880 + }, + { + "epoch": 2.294863242161441, + "ref_ce_loss": 0.10730014741420746, + "step": 6880 + }, + { + "epoch": 2.2981987991994663, + "loss": 0.5339, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "grad_norm": 3.124227523803711, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "learning_rate": 0.0002508763440035497, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "loss": 0.4120410978794098, + "step": 6890 + }, + { + "ce_loss": 0.17701852321624756, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "distill_loss": 0.11152602732181549, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "ref_ce_loss": 0.0911896601319313, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "loss": 0.4871509373188019, + "step": 6890 + }, + { + "ce_loss": 0.20323820412158966, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "distill_loss": 0.11872399598360062, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "ref_ce_loss": 0.16509409248828888, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "loss": 0.8271893262863159, + "step": 6890 + }, + { + "ce_loss": 0.20623229444026947, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "distill_loss": 0.17105066776275635, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "ref_ce_loss": 0.11689022928476334, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "loss": 0.5279237627983093, + "step": 6890 + }, + { + "ce_loss": 0.2500387728214264, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "distill_loss": 0.1337040662765503, + "epoch": 2.2981987991994663, + "step": 6890 + }, + { + "epoch": 2.2981987991994663, + "ref_ce_loss": 0.10777636617422104, + "step": 6890 + }, + { + "epoch": 2.3015343562374917, + "loss": 0.5284, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "grad_norm": 1.9892547130584717, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "learning_rate": 0.0002507263385916618, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "loss": 0.33847102522850037, + "step": 6900 + }, + { + "ce_loss": 0.07629029452800751, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "distill_loss": 0.09815974533557892, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "ref_ce_loss": 0.11757113039493561, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "loss": 0.8411189317703247, + "step": 6900 + }, + { + "ce_loss": 0.17542950809001923, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "distill_loss": 0.12596750259399414, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "ref_ce_loss": 0.0883040726184845, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "loss": 0.6303671598434448, + "step": 6900 + }, + { + "ce_loss": 0.07354290038347244, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "distill_loss": 0.08964565396308899, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "ref_ce_loss": 0.11973276734352112, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "loss": 0.9342833757400513, + "step": 6900 + }, + { + "ce_loss": 0.3235514163970947, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "distill_loss": 0.10573019087314606, + "epoch": 2.3015343562374917, + "step": 6900 + }, + { + "epoch": 2.3015343562374917, + "ref_ce_loss": 0.21635988354682922, + "step": 6900 + }, + { + "epoch": 2.304869913275517, + "loss": 0.547, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "grad_norm": 3.8128700256347656, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "learning_rate": 0.00025057614949466564, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "loss": 0.38203656673431396, + "step": 6910 + }, + { + "ce_loss": 0.15076223015785217, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "distill_loss": 0.11055965721607208, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "ref_ce_loss": 0.11993446946144104, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "loss": 0.3910497725009918, + "step": 6910 + }, + { + "ce_loss": 0.1450098156929016, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "distill_loss": 0.08259563148021698, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "ref_ce_loss": 0.11308462172746658, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "loss": 0.4037795066833496, + "step": 6910 + }, + { + "ce_loss": 0.14874710142612457, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "distill_loss": 0.094327911734581, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "ref_ce_loss": 0.06298214197158813, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "loss": 0.3290916681289673, + "step": 6910 + }, + { + "ce_loss": 0.12153016775846481, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "distill_loss": 0.09355287253856659, + "epoch": 2.304869913275517, + "step": 6910 + }, + { + "epoch": 2.304869913275517, + "ref_ce_loss": 0.0676640197634697, + "step": 6910 + }, + { + "epoch": 2.3082054703135424, + "loss": 0.5028, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "grad_norm": 2.110680341720581, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "learning_rate": 0.0002504257769864468, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "loss": 0.35967057943344116, + "step": 6920 + }, + { + "ce_loss": 0.10454410314559937, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "distill_loss": 0.0831480473279953, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "ref_ce_loss": 0.1044725775718689, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "loss": 0.4940726161003113, + "step": 6920 + }, + { + "ce_loss": 0.1524488776922226, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "distill_loss": 0.10413416475057602, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "ref_ce_loss": 0.14821214973926544, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "loss": 0.6797365546226501, + "step": 6920 + }, + { + "ce_loss": 0.2348724603652954, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "distill_loss": 0.17091615498065948, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "ref_ce_loss": 0.1520233303308487, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "loss": 0.6118256449699402, + "step": 6920 + }, + { + "ce_loss": 0.2456093281507492, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "distill_loss": 0.1130257397890091, + "epoch": 2.3082054703135424, + "step": 6920 + }, + { + "epoch": 2.3082054703135424, + "ref_ce_loss": 0.1817878931760788, + "step": 6920 + }, + { + "epoch": 2.3115410273515677, + "loss": 0.5564, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "grad_norm": 5.056583881378174, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "learning_rate": 0.0002502752213412255, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "loss": 0.5287309288978577, + "step": 6930 + }, + { + "ce_loss": 0.22509720921516418, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "distill_loss": 0.12229382991790771, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "ref_ce_loss": 0.14213591814041138, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "loss": 0.3925534188747406, + "step": 6930 + }, + { + "ce_loss": 0.1103292927145958, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "distill_loss": 0.10397907346487045, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "ref_ce_loss": 0.09071511775255203, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "loss": 0.7502164840698242, + "step": 6930 + }, + { + "ce_loss": 0.25732266902923584, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "distill_loss": 0.13244201242923737, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "ref_ce_loss": 0.16759437322616577, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "loss": 0.42582589387893677, + "step": 6930 + }, + { + "ce_loss": 0.13769946992397308, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "distill_loss": 0.08645257353782654, + "epoch": 2.3115410273515677, + "step": 6930 + }, + { + "epoch": 2.3115410273515677, + "ref_ce_loss": 0.1043008342385292, + "step": 6930 + }, + { + "epoch": 2.314876584389593, + "loss": 0.533, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "grad_norm": 5.912909507751465, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "learning_rate": 0.00025012448283355586, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "loss": 0.4608493745326996, + "step": 6940 + }, + { + "ce_loss": 0.13830618560314178, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "distill_loss": 0.1606249064207077, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "ref_ce_loss": 0.13436095416545868, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "loss": 0.3904031217098236, + "step": 6940 + }, + { + "ce_loss": 0.14121873676776886, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "distill_loss": 0.18405036628246307, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "ref_ce_loss": 0.06506030261516571, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "loss": 0.8715546131134033, + "step": 6940 + }, + { + "ce_loss": 0.2018638551235199, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "distill_loss": 0.20629999041557312, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "ref_ce_loss": 0.11235152184963226, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "loss": 0.5256648063659668, + "step": 6940 + }, + { + "ce_loss": 0.22348742187023163, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "distill_loss": 0.20976266264915466, + "epoch": 2.314876584389593, + "step": 6940 + }, + { + "epoch": 2.314876584389593, + "ref_ce_loss": 0.09235940873622894, + "step": 6940 + }, + { + "epoch": 2.3182121414276184, + "loss": 0.5821, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "grad_norm": 2.4662444591522217, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "learning_rate": 0.00024997356173832536, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "loss": 0.61894690990448, + "step": 6950 + }, + { + "ce_loss": 0.24030759930610657, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "distill_loss": 0.1709204614162445, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "ref_ce_loss": 0.14262616634368896, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "loss": 0.7224009037017822, + "step": 6950 + }, + { + "ce_loss": 0.1713646501302719, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "distill_loss": 0.21908612549304962, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "ref_ce_loss": 0.12334079295396805, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "loss": 0.6632697582244873, + "step": 6950 + }, + { + "ce_loss": 0.1673959642648697, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "distill_loss": 0.19471748173236847, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "ref_ce_loss": 0.12122780829668045, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "loss": 0.8324764966964722, + "step": 6950 + }, + { + "ce_loss": 0.24417831003665924, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "distill_loss": 0.18310905992984772, + "epoch": 2.3182121414276184, + "step": 6950 + }, + { + "epoch": 2.3182121414276184, + "ref_ce_loss": 0.17036187648773193, + "step": 6950 + }, + { + "epoch": 2.321547698465644, + "loss": 0.5725, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "grad_norm": 4.33769416809082, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "learning_rate": 0.00024982245833075466, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "loss": 0.6916789412498474, + "step": 6960 + }, + { + "ce_loss": 0.23741775751113892, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "distill_loss": 0.17516985535621643, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "ref_ce_loss": 0.1561703383922577, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "loss": 0.4892004132270813, + "step": 6960 + }, + { + "ce_loss": 0.10765889286994934, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "distill_loss": 0.1549256592988968, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "ref_ce_loss": 0.07869797199964523, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "loss": 0.5315861701965332, + "step": 6960 + }, + { + "ce_loss": 0.23685623705387115, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "distill_loss": 0.16759350895881653, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "ref_ce_loss": 0.12687110900878906, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "loss": 0.9347847700119019, + "step": 6960 + }, + { + "ce_loss": 0.32063230872154236, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "distill_loss": 0.20522774755954742, + "epoch": 2.321547698465644, + "step": 6960 + }, + { + "epoch": 2.321547698465644, + "ref_ce_loss": 0.14673230051994324, + "step": 6960 + }, + { + "epoch": 2.324883255503669, + "loss": 0.5492, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "grad_norm": 2.2744076251983643, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "learning_rate": 0.0002496711728863967, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "loss": 0.7938024997711182, + "step": 6970 + }, + { + "ce_loss": 0.3438526391983032, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "distill_loss": 0.18260395526885986, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "ref_ce_loss": 0.13226082921028137, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "loss": 0.48433953523635864, + "step": 6970 + }, + { + "ce_loss": 0.06481018662452698, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "distill_loss": 0.1196662038564682, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "ref_ce_loss": 0.09014244377613068, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "loss": 0.7640020847320557, + "step": 6970 + }, + { + "ce_loss": 0.1803368777036667, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "distill_loss": 0.1577882617712021, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "ref_ce_loss": 0.11870657652616501, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "loss": 0.482220321893692, + "step": 6970 + }, + { + "ce_loss": 0.16063043475151062, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "distill_loss": 0.11836622655391693, + "epoch": 2.324883255503669, + "step": 6970 + }, + { + "epoch": 2.324883255503669, + "ref_ce_loss": 0.1077476367354393, + "step": 6970 + }, + { + "epoch": 2.3282188125416945, + "loss": 0.5672, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "grad_norm": 1.7576720714569092, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "learning_rate": 0.00024951970568113643, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "loss": 0.4736265540122986, + "step": 6980 + }, + { + "ce_loss": 0.19416333734989166, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "distill_loss": 0.08972302824258804, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "ref_ce_loss": 0.1895930916070938, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "loss": 0.84056156873703, + "step": 6980 + }, + { + "ce_loss": 0.1776997596025467, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "distill_loss": 0.10737963765859604, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "ref_ce_loss": 0.20549514889717102, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "loss": 0.40008676052093506, + "step": 6980 + }, + { + "ce_loss": 0.17353692650794983, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "distill_loss": 0.11199227720499039, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "ref_ce_loss": 0.06525639444589615, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "loss": 0.5162920355796814, + "step": 6980 + }, + { + "ce_loss": 0.1759594827890396, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "distill_loss": 0.08586210012435913, + "epoch": 2.3282188125416945, + "step": 6980 + }, + { + "epoch": 2.3282188125416945, + "ref_ce_loss": 0.1958712935447693, + "step": 6980 + }, + { + "epoch": 2.33155436957972, + "loss": 0.5367, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "grad_norm": 2.3340952396392822, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "learning_rate": 0.00024936805699119033, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "loss": 0.5874610543251038, + "step": 6990 + }, + { + "ce_loss": 0.24022972583770752, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "distill_loss": 0.12990692257881165, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "ref_ce_loss": 0.13573411107063293, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "loss": 0.683862030506134, + "step": 6990 + }, + { + "ce_loss": 0.19738270342350006, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "distill_loss": 0.12065930664539337, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "ref_ce_loss": 0.13016332685947418, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "loss": 0.5245134830474854, + "step": 6990 + }, + { + "ce_loss": 0.1756822019815445, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "distill_loss": 0.09732089936733246, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "ref_ce_loss": 0.11447296291589737, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "loss": 0.43875250220298767, + "step": 6990 + }, + { + "ce_loss": 0.23195013403892517, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "distill_loss": 0.10867423564195633, + "epoch": 2.33155436957972, + "step": 6990 + }, + { + "epoch": 2.33155436957972, + "ref_ce_loss": 0.09810131043195724, + "step": 6990 + }, + { + "epoch": 2.334889926617745, + "loss": 0.5477, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "grad_norm": 5.084808826446533, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "learning_rate": 0.0002492162270931058, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "loss": 0.6767491102218628, + "step": 7000 + }, + { + "ce_loss": 0.1351834088563919, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "distill_loss": 0.11761026084423065, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "ref_ce_loss": 0.08548547327518463, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "loss": 0.35615816712379456, + "step": 7000 + }, + { + "ce_loss": 0.06495611369609833, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "distill_loss": 0.10093344748020172, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "ref_ce_loss": 0.12442238628864288, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "loss": 1.0504748821258545, + "step": 7000 + }, + { + "ce_loss": 0.5850064158439636, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "distill_loss": 0.11385861039161682, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "ref_ce_loss": 0.28474873304367065, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "loss": 0.4212246537208557, + "step": 7000 + }, + { + "ce_loss": 0.16298897564411163, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "distill_loss": 0.11466117203235626, + "epoch": 2.334889926617745, + "step": 7000 + }, + { + "epoch": 2.334889926617745, + "ref_ce_loss": 0.11506110429763794, + "step": 7000 + }, + { + "epoch": 2.3382254836557705, + "loss": 0.4893, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "grad_norm": 3.7350194454193115, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "learning_rate": 0.0002490642162637606, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "loss": 0.49059048295021057, + "step": 7010 + }, + { + "ce_loss": 0.18139110505580902, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "distill_loss": 0.10281254351139069, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "ref_ce_loss": 0.12374398857355118, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "loss": 0.5319090485572815, + "step": 7010 + }, + { + "ce_loss": 0.19318652153015137, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "distill_loss": 0.10822658240795135, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "ref_ce_loss": 0.10500287264585495, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "loss": 0.5998363494873047, + "step": 7010 + }, + { + "ce_loss": 0.15518829226493835, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "distill_loss": 0.08122570812702179, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "ref_ce_loss": 0.08441562205553055, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "loss": 0.4391767084598541, + "step": 7010 + }, + { + "ce_loss": 0.1472361832857132, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "distill_loss": 0.09130523353815079, + "epoch": 2.3382254836557705, + "step": 7010 + }, + { + "epoch": 2.3382254836557705, + "ref_ce_loss": 0.12980183959007263, + "step": 7010 + }, + { + "epoch": 2.341561040693796, + "loss": 0.55, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "grad_norm": 2.3400723934173584, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "learning_rate": 0.00024891202478036266, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "loss": 0.5382486581802368, + "step": 7020 + }, + { + "ce_loss": 0.20204707980155945, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "distill_loss": 0.11083859950304031, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "ref_ce_loss": 0.10628888756036758, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "loss": 1.0087568759918213, + "step": 7020 + }, + { + "ce_loss": 0.20493319630622864, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "distill_loss": 0.11655318737030029, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "ref_ce_loss": 0.13127465546131134, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "loss": 0.3462040424346924, + "step": 7020 + }, + { + "ce_loss": 0.11205989122390747, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "distill_loss": 0.09971515089273453, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "ref_ce_loss": 0.095881387591362, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "loss": 0.5632637739181519, + "step": 7020 + }, + { + "ce_loss": 0.09938246756792068, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "distill_loss": 0.11057905852794647, + "epoch": 2.341561040693796, + "step": 7020 + }, + { + "epoch": 2.341561040693796, + "ref_ce_loss": 0.08367249369621277, + "step": 7020 + }, + { + "epoch": 2.3448965977318212, + "loss": 0.5794, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "grad_norm": 2.2300267219543457, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "learning_rate": 0.0002487596529204491, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "loss": 0.34646064043045044, + "step": 7030 + }, + { + "ce_loss": 0.10655689984560013, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "distill_loss": 0.07144635915756226, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "ref_ce_loss": 0.11395888030529022, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "loss": 0.9182996153831482, + "step": 7030 + }, + { + "ce_loss": 0.12896713614463806, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "distill_loss": 0.12328529357910156, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "ref_ce_loss": 0.1499665230512619, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "loss": 0.796744167804718, + "step": 7030 + }, + { + "ce_loss": 0.11686861515045166, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "distill_loss": 0.11429211497306824, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "ref_ce_loss": 0.12650713324546814, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "loss": 0.5051373243331909, + "step": 7030 + }, + { + "ce_loss": 0.19313965737819672, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "distill_loss": 0.08210955560207367, + "epoch": 2.3448965977318212, + "step": 7030 + }, + { + "epoch": 2.3448965977318212, + "ref_ce_loss": 0.12427300959825516, + "step": 7030 + }, + { + "epoch": 2.3482321547698466, + "loss": 0.5902, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "grad_norm": 2.497122287750244, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "learning_rate": 0.0002486071009618861, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "loss": 0.436869740486145, + "step": 7040 + }, + { + "ce_loss": 0.11832589656114578, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "distill_loss": 0.11758086085319519, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "ref_ce_loss": 0.13616949319839478, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "loss": 0.6059623956680298, + "step": 7040 + }, + { + "ce_loss": 0.24056315422058105, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "distill_loss": 0.1205277070403099, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "ref_ce_loss": 0.1293945163488388, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "loss": 1.240486979484558, + "step": 7040 + }, + { + "ce_loss": 0.18717877566814423, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "distill_loss": 0.13339264690876007, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "ref_ce_loss": 0.10200013965368271, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "loss": 0.49349045753479004, + "step": 7040 + }, + { + "ce_loss": 0.23079393804073334, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "distill_loss": 0.14471150934696198, + "epoch": 2.3482321547698466, + "step": 7040 + }, + { + "epoch": 2.3482321547698466, + "ref_ce_loss": 0.11784505099058151, + "step": 7040 + }, + { + "epoch": 2.351567711807872, + "loss": 0.4947, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "grad_norm": 2.4697394371032715, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "learning_rate": 0.0002484543691828683, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "loss": 0.8933064341545105, + "step": 7050 + }, + { + "ce_loss": 0.1610158532857895, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "distill_loss": 0.11047893017530441, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "ref_ce_loss": 0.15877987444400787, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "loss": 0.3495582044124603, + "step": 7050 + }, + { + "ce_loss": 0.13080823421478271, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "distill_loss": 0.10535383224487305, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "ref_ce_loss": 0.11328636109828949, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "loss": 0.6516172885894775, + "step": 7050 + }, + { + "ce_loss": 0.19208601117134094, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "distill_loss": 0.11426350474357605, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "ref_ce_loss": 0.11293771117925644, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "loss": 0.7531253695487976, + "step": 7050 + }, + { + "ce_loss": 0.20007860660552979, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "distill_loss": 0.11392208933830261, + "epoch": 2.351567711807872, + "step": 7050 + }, + { + "epoch": 2.351567711807872, + "ref_ce_loss": 0.09171842783689499, + "step": 7050 + }, + { + "epoch": 2.3549032688458973, + "loss": 0.5329, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "grad_norm": 3.29000186920166, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "learning_rate": 0.0002483014578619181, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "loss": 0.5926487445831299, + "step": 7060 + }, + { + "ce_loss": 0.2161916345357895, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "distill_loss": 0.11993826925754547, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "ref_ce_loss": 0.09713123738765717, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "loss": 0.5170358419418335, + "step": 7060 + }, + { + "ce_loss": 0.1522398144006729, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "distill_loss": 0.10683247447013855, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "ref_ce_loss": 0.10169428586959839, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "loss": 0.33274805545806885, + "step": 7060 + }, + { + "ce_loss": 0.12017644196748734, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "distill_loss": 0.10068129003047943, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "ref_ce_loss": 0.11175446957349777, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "loss": 0.5639784932136536, + "step": 7060 + }, + { + "ce_loss": 0.19975608587265015, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "distill_loss": 0.13790909945964813, + "epoch": 2.3549032688458973, + "step": 7060 + }, + { + "epoch": 2.3549032688458973, + "ref_ce_loss": 0.17795276641845703, + "step": 7060 + }, + { + "epoch": 2.3582388258839226, + "loss": 0.5653, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "grad_norm": 11.505758285522461, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "learning_rate": 0.00024814836727788563, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "loss": 1.0071794986724854, + "step": 7070 + }, + { + "ce_loss": 0.23054859042167664, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "distill_loss": 0.217034250497818, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "ref_ce_loss": 0.13613423705101013, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "loss": 0.5255445241928101, + "step": 7070 + }, + { + "ce_loss": 0.16354890167713165, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "distill_loss": 0.1455252468585968, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "ref_ce_loss": 0.13496388494968414, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "loss": 0.45980530977249146, + "step": 7070 + }, + { + "ce_loss": 0.16896405816078186, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "distill_loss": 0.13719318807125092, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "ref_ce_loss": 0.09889309853315353, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "loss": 0.6259099841117859, + "step": 7070 + }, + { + "ce_loss": 0.2755209505558014, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "distill_loss": 0.16722777485847473, + "epoch": 2.3582388258839226, + "step": 7070 + }, + { + "epoch": 2.3582388258839226, + "ref_ce_loss": 0.1830323040485382, + "step": 7070 + }, + { + "epoch": 2.361574382921948, + "loss": 0.6363, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "grad_norm": 2.382572889328003, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "learning_rate": 0.0002479950977099476, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "loss": 0.547733724117279, + "step": 7080 + }, + { + "ce_loss": 0.15202975273132324, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "distill_loss": 0.12852467596530914, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "ref_ce_loss": 0.10482831299304962, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "loss": 1.1188392639160156, + "step": 7080 + }, + { + "ce_loss": 0.15732690691947937, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "distill_loss": 0.13412557542324066, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "ref_ce_loss": 0.12009337544441223, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "loss": 0.551268994808197, + "step": 7080 + }, + { + "ce_loss": 0.12435823678970337, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "distill_loss": 0.10511200875043869, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "ref_ce_loss": 0.09536106884479523, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "loss": 0.5037704706192017, + "step": 7080 + }, + { + "ce_loss": 0.20579804480075836, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "distill_loss": 0.15192145109176636, + "epoch": 2.361574382921948, + "step": 7080 + }, + { + "epoch": 2.361574382921948, + "ref_ce_loss": 0.10141074657440186, + "step": 7080 + }, + { + "epoch": 2.3649099399599733, + "loss": 0.5554, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "grad_norm": 2.5210964679718018, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "learning_rate": 0.0002478416494376072, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "loss": 0.5080283284187317, + "step": 7090 + }, + { + "ce_loss": 0.2142353057861328, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "distill_loss": 0.12848493456840515, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "ref_ce_loss": 0.13930000364780426, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "loss": 0.4127797484397888, + "step": 7090 + }, + { + "ce_loss": 0.18228977918624878, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "distill_loss": 0.12166300415992737, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "ref_ce_loss": 0.10858716070652008, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "loss": 1.189202070236206, + "step": 7090 + }, + { + "ce_loss": 0.25980982184410095, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "distill_loss": 0.17335101962089539, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "ref_ce_loss": 0.17701305449008942, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "loss": 0.4164826273918152, + "step": 7090 + }, + { + "ce_loss": 0.15672534704208374, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "distill_loss": 0.11506892740726471, + "epoch": 2.3649099399599733, + "step": 7090 + }, + { + "epoch": 2.3649099399599733, + "ref_ce_loss": 0.09737000614404678, + "step": 7090 + }, + { + "epoch": 2.3682454969979987, + "loss": 0.5592, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "grad_norm": 3.242518186569214, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "learning_rate": 0.00024768802274069364, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "loss": 0.8438761234283447, + "step": 7100 + }, + { + "ce_loss": 0.11892778426408768, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "distill_loss": 0.14269644021987915, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "ref_ce_loss": 0.09960228204727173, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "loss": 0.5341728925704956, + "step": 7100 + }, + { + "ce_loss": 0.13597016036510468, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "distill_loss": 0.11470731347799301, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "ref_ce_loss": 0.16889235377311707, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "loss": 0.4424186646938324, + "step": 7100 + }, + { + "ce_loss": 0.10149525851011276, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "distill_loss": 0.11648110300302505, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "ref_ce_loss": 0.09637293964624405, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "loss": 1.2790297269821167, + "step": 7100 + }, + { + "ce_loss": 0.29890012741088867, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "distill_loss": 0.15040647983551025, + "epoch": 2.3682454969979987, + "step": 7100 + }, + { + "epoch": 2.3682454969979987, + "ref_ce_loss": 0.11567104607820511, + "step": 7100 + }, + { + "epoch": 2.371581054036024, + "loss": 0.5256, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "grad_norm": 2.799344062805176, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "learning_rate": 0.0002475342178993614, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "loss": 0.5514856576919556, + "step": 7110 + }, + { + "ce_loss": 0.2613866627216339, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "distill_loss": 0.11943652480840683, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "ref_ce_loss": 0.10984734445810318, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "loss": 0.6448547840118408, + "step": 7110 + }, + { + "ce_loss": 0.19715182483196259, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "distill_loss": 0.11577840894460678, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "ref_ce_loss": 0.09529617428779602, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "loss": 0.7144370079040527, + "step": 7110 + }, + { + "ce_loss": 0.1511741429567337, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "distill_loss": 0.11388468742370605, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "ref_ce_loss": 0.11055812239646912, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "loss": 0.3908351957798004, + "step": 7110 + }, + { + "ce_loss": 0.1625000536441803, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "distill_loss": 0.10220208019018173, + "epoch": 2.371581054036024, + "step": 7110 + }, + { + "epoch": 2.371581054036024, + "ref_ce_loss": 0.08996450901031494, + "step": 7110 + }, + { + "epoch": 2.3749166110740494, + "loss": 0.5628, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "grad_norm": 3.208664655685425, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "learning_rate": 0.00024738023519408985, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "loss": 0.3598056137561798, + "step": 7120 + }, + { + "ce_loss": 0.0844772681593895, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "distill_loss": 0.1388201117515564, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "ref_ce_loss": 0.13615448772907257, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "loss": 0.4420519471168518, + "step": 7120 + }, + { + "ce_loss": 0.13381193578243256, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "distill_loss": 0.11907447874546051, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "ref_ce_loss": 0.11265262961387634, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "loss": 0.31493082642555237, + "step": 7120 + }, + { + "ce_loss": 0.10242314636707306, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "distill_loss": 0.10008740425109863, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "ref_ce_loss": 0.11232121288776398, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "loss": 0.5138975977897644, + "step": 7120 + }, + { + "ce_loss": 0.16858409345149994, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "distill_loss": 0.1288112848997116, + "epoch": 2.3749166110740494, + "step": 7120 + }, + { + "epoch": 2.3749166110740494, + "ref_ce_loss": 0.12751555442810059, + "step": 7120 + }, + { + "epoch": 2.3782521681120747, + "loss": 0.5237, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "grad_norm": 2.2312326431274414, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "learning_rate": 0.00024722607490568264, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "loss": 0.34697839617729187, + "step": 7130 + }, + { + "ce_loss": 0.09735045582056046, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "distill_loss": 0.11878188699483871, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "ref_ce_loss": 0.09604649245738983, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "loss": 0.5361130833625793, + "step": 7130 + }, + { + "ce_loss": 0.22890308499336243, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "distill_loss": 0.12308313697576523, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "ref_ce_loss": 0.1419752985239029, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "loss": 0.47083860635757446, + "step": 7130 + }, + { + "ce_loss": 0.22402635216712952, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "distill_loss": 0.12542489171028137, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "ref_ce_loss": 0.12129999697208405, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "loss": 0.4552367031574249, + "step": 7130 + }, + { + "ce_loss": 0.09671945869922638, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "distill_loss": 0.12107788771390915, + "epoch": 2.3782521681120747, + "step": 7130 + }, + { + "epoch": 2.3782521681120747, + "ref_ce_loss": 0.05718206241726875, + "step": 7130 + }, + { + "epoch": 2.3815877251501, + "loss": 0.5365, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "grad_norm": 2.714935541152954, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "learning_rate": 0.00024707173731526735, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "loss": 0.5343358516693115, + "step": 7140 + }, + { + "ce_loss": 0.16792216897010803, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "distill_loss": 0.1140684261918068, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "ref_ce_loss": 0.11957409977912903, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "loss": 0.5848384499549866, + "step": 7140 + }, + { + "ce_loss": 0.12534525990486145, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "distill_loss": 0.1129332184791565, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "ref_ce_loss": 0.1090116947889328, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "loss": 0.5402489900588989, + "step": 7140 + }, + { + "ce_loss": 0.19635345041751862, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "distill_loss": 0.12364673614501953, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "ref_ce_loss": 0.10902047902345657, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "loss": 0.46899038553237915, + "step": 7140 + }, + { + "ce_loss": 0.13804945349693298, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "distill_loss": 0.1111493930220604, + "epoch": 2.3815877251501, + "step": 7140 + }, + { + "epoch": 2.3815877251501, + "ref_ce_loss": 0.14478346705436707, + "step": 7140 + }, + { + "epoch": 2.3849232821881254, + "loss": 0.5629, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "grad_norm": 6.925165176391602, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "learning_rate": 0.0002469172227042948, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "loss": 0.48434075713157654, + "step": 7150 + }, + { + "ce_loss": 0.13119007647037506, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "distill_loss": 0.15414267778396606, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "ref_ce_loss": 0.14164388179779053, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "loss": 0.5125703811645508, + "step": 7150 + }, + { + "ce_loss": 0.08586469292640686, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "distill_loss": 0.12737464904785156, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "ref_ce_loss": 0.16618117690086365, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "loss": 0.6230001449584961, + "step": 7150 + }, + { + "ce_loss": 0.19951888918876648, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "distill_loss": 0.18361909687519073, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "ref_ce_loss": 0.10332217812538147, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "loss": 1.4790210723876953, + "step": 7150 + }, + { + "ce_loss": 0.33115363121032715, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "distill_loss": 0.19230781495571136, + "epoch": 2.3849232821881254, + "step": 7150 + }, + { + "epoch": 2.3849232821881254, + "ref_ce_loss": 0.16792990267276764, + "step": 7150 + }, + { + "epoch": 2.388258839226151, + "loss": 0.616, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "grad_norm": 4.0847601890563965, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "learning_rate": 0.0002467625313545389, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "loss": 0.611827552318573, + "step": 7160 + }, + { + "ce_loss": 0.2245769500732422, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "distill_loss": 0.16543962061405182, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "ref_ce_loss": 0.15356603264808655, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "loss": 0.45121482014656067, + "step": 7160 + }, + { + "ce_loss": 0.15090312063694, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "distill_loss": 0.17984391748905182, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "ref_ce_loss": 0.09603415429592133, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "loss": 0.5079750418663025, + "step": 7160 + }, + { + "ce_loss": 0.11780863255262375, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "distill_loss": 0.10958898812532425, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "ref_ce_loss": 0.10029244422912598, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "loss": 0.5058857202529907, + "step": 7160 + }, + { + "ce_loss": 0.1748339980840683, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "distill_loss": 0.11158843338489532, + "epoch": 2.388258839226151, + "step": 7160 + }, + { + "epoch": 2.388258839226151, + "ref_ce_loss": 0.13546571135520935, + "step": 7160 + }, + { + "epoch": 2.391594396264176, + "loss": 0.5656, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "grad_norm": 7.381030559539795, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "learning_rate": 0.00024660766354809546, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "loss": 0.48350441455841064, + "step": 7170 + }, + { + "ce_loss": 0.1267509013414383, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "distill_loss": 0.1259116232395172, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "ref_ce_loss": 0.10241179913282394, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "loss": 0.6579767465591431, + "step": 7170 + }, + { + "ce_loss": 0.17158272862434387, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "distill_loss": 0.14517442882061005, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "ref_ce_loss": 0.11714108288288116, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "loss": 0.47211408615112305, + "step": 7170 + }, + { + "ce_loss": 0.21960081160068512, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "distill_loss": 0.13582301139831543, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "ref_ce_loss": 0.11660350859165192, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "loss": 0.3569020926952362, + "step": 7170 + }, + { + "ce_loss": 0.14226363599300385, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "distill_loss": 0.12439102679491043, + "epoch": 2.391594396264176, + "step": 7170 + }, + { + "epoch": 2.391594396264176, + "ref_ce_loss": 0.09003002196550369, + "step": 7170 + }, + { + "epoch": 2.3949299533022015, + "loss": 0.5258, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "grad_norm": 2.013179063796997, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "learning_rate": 0.00024645261956738224, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "loss": 0.6626637578010559, + "step": 7180 + }, + { + "ce_loss": 0.1727692186832428, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "distill_loss": 0.10929200053215027, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "ref_ce_loss": 0.11079227924346924, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "loss": 0.4409700632095337, + "step": 7180 + }, + { + "ce_loss": 0.14989428222179413, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "distill_loss": 0.10095194727182388, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "ref_ce_loss": 0.08986108005046844, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "loss": 0.44294336438179016, + "step": 7180 + }, + { + "ce_loss": 0.17101377248764038, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "distill_loss": 0.12090830504894257, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "ref_ce_loss": 0.11498505622148514, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "loss": 0.7781896591186523, + "step": 7180 + }, + { + "ce_loss": 0.12104091793298721, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "distill_loss": 0.09193304926156998, + "epoch": 2.3949299533022015, + "step": 7180 + }, + { + "epoch": 2.3949299533022015, + "ref_ce_loss": 0.11385586112737656, + "step": 7180 + }, + { + "epoch": 2.398265510340227, + "loss": 0.6057, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "grad_norm": 4.252617835998535, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "learning_rate": 0.00024629739969513845, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "loss": 0.5514110326766968, + "step": 7190 + }, + { + "ce_loss": 0.09623024612665176, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "distill_loss": 0.13418076932430267, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "ref_ce_loss": 0.09687874466180801, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "loss": 0.8390406370162964, + "step": 7190 + }, + { + "ce_loss": 0.20159050822257996, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "distill_loss": 0.13292783498764038, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "ref_ce_loss": 0.1463102549314499, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "loss": 0.7187709808349609, + "step": 7190 + }, + { + "ce_loss": 0.1949305534362793, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "distill_loss": 0.12101311981678009, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "ref_ce_loss": 0.12897959351539612, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "loss": 0.7193197011947632, + "step": 7190 + }, + { + "ce_loss": 0.16670188307762146, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "distill_loss": 0.14500631392002106, + "epoch": 2.398265510340227, + "step": 7190 + }, + { + "epoch": 2.398265510340227, + "ref_ce_loss": 0.12325281649827957, + "step": 7190 + }, + { + "epoch": 2.401601067378252, + "loss": 0.6018, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "grad_norm": 3.247999668121338, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "learning_rate": 0.00024614200421442387, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "loss": 0.5324526429176331, + "step": 7200 + }, + { + "ce_loss": 0.11869847774505615, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "distill_loss": 0.11210515350103378, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "ref_ce_loss": 0.12298411130905151, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "loss": 0.5949093103408813, + "step": 7200 + }, + { + "ce_loss": 0.07167255133390427, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "distill_loss": 0.11562314629554749, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "ref_ce_loss": 0.1257380098104477, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "loss": 0.7308820486068726, + "step": 7200 + }, + { + "ce_loss": 0.21154363453388214, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "distill_loss": 0.11605453491210938, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "ref_ce_loss": 0.14783884584903717, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "loss": 0.7168588638305664, + "step": 7200 + }, + { + "ce_loss": 0.2147577702999115, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "distill_loss": 0.1553606241941452, + "epoch": 2.401601067378252, + "step": 7200 + }, + { + "epoch": 2.401601067378252, + "ref_ce_loss": 0.17037084698677063, + "step": 7200 + }, + { + "epoch": 2.4049366244162775, + "loss": 0.5372, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "grad_norm": 2.400362014770508, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "learning_rate": 0.0002459864334086185, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "loss": 0.6147167086601257, + "step": 7210 + }, + { + "ce_loss": 0.21412178874015808, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "distill_loss": 0.13666468858718872, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "ref_ce_loss": 0.20343522727489471, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "loss": 0.4731769263744354, + "step": 7210 + }, + { + "ce_loss": 0.11004534363746643, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "distill_loss": 0.11246877908706665, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "ref_ce_loss": 0.18797901272773743, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "loss": 0.4884134531021118, + "step": 7210 + }, + { + "ce_loss": 0.1199505552649498, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "distill_loss": 0.1352088749408722, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "ref_ce_loss": 0.08366453647613525, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "loss": 0.3768484890460968, + "step": 7210 + }, + { + "ce_loss": 0.13738639652729034, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "distill_loss": 0.1332009732723236, + "epoch": 2.4049366244162775, + "step": 7210 + }, + { + "epoch": 2.4049366244162775, + "ref_ce_loss": 0.07930707186460495, + "step": 7210 + }, + { + "epoch": 2.408272181454303, + "loss": 0.5629, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "grad_norm": 2.687861204147339, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "learning_rate": 0.0002458306875614221, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "loss": 0.5342181921005249, + "step": 7220 + }, + { + "ce_loss": 0.19010506570339203, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "distill_loss": 0.143671452999115, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "ref_ce_loss": 0.20026570558547974, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "loss": 0.4170488715171814, + "step": 7220 + }, + { + "ce_loss": 0.1819777935743332, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "distill_loss": 0.1289082020521164, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "ref_ce_loss": 0.10599298775196075, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "loss": 0.9120379686355591, + "step": 7220 + }, + { + "ce_loss": 0.18983766436576843, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "distill_loss": 0.17872394621372223, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "ref_ce_loss": 0.11700882762670517, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "loss": 0.4734853506088257, + "step": 7220 + }, + { + "ce_loss": 0.13527773320674896, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "distill_loss": 0.14905977249145508, + "epoch": 2.408272181454303, + "step": 7220 + }, + { + "epoch": 2.408272181454303, + "ref_ce_loss": 0.11568107455968857, + "step": 7220 + }, + { + "epoch": 2.4116077384923282, + "loss": 0.5457, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "grad_norm": 2.3142776489257812, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "learning_rate": 0.0002456747669568538, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "loss": 0.7524964213371277, + "step": 7230 + }, + { + "ce_loss": 0.30550333857536316, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "distill_loss": 0.15317846834659576, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "ref_ce_loss": 0.1951562613248825, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "loss": 0.5358716249465942, + "step": 7230 + }, + { + "ce_loss": 0.14135171473026276, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "distill_loss": 0.13840961456298828, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "ref_ce_loss": 0.14255258440971375, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "loss": 0.994782567024231, + "step": 7230 + }, + { + "ce_loss": 0.2172781229019165, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "distill_loss": 0.15493077039718628, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "ref_ce_loss": 0.124776691198349, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "loss": 0.3869795799255371, + "step": 7230 + }, + { + "ce_loss": 0.1280602514743805, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "distill_loss": 0.13424427807331085, + "epoch": 2.4116077384923282, + "step": 7230 + }, + { + "epoch": 2.4116077384923282, + "ref_ce_loss": 0.09938256442546844, + "step": 7230 + }, + { + "epoch": 2.4149432955303536, + "loss": 0.6535, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "grad_norm": 3.7801764011383057, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "learning_rate": 0.00024551867187925114, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "loss": 1.0940275192260742, + "step": 7240 + }, + { + "ce_loss": 0.30816763639450073, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "distill_loss": 0.16756215691566467, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "ref_ce_loss": 0.1721998006105423, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "loss": 0.5963624119758606, + "step": 7240 + }, + { + "ce_loss": 0.18392635881900787, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "distill_loss": 0.14534255862236023, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "ref_ce_loss": 0.13652925193309784, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "loss": 0.8176337480545044, + "step": 7240 + }, + { + "ce_loss": 0.14677022397518158, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "distill_loss": 0.15364260971546173, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "ref_ce_loss": 0.14605900645256042, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "loss": 0.5201737880706787, + "step": 7240 + }, + { + "ce_loss": 0.14001545310020447, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "distill_loss": 0.21256887912750244, + "epoch": 2.4149432955303536, + "step": 7240 + }, + { + "epoch": 2.4149432955303536, + "ref_ce_loss": 0.11998400092124939, + "step": 7240 + }, + { + "epoch": 2.418278852568379, + "loss": 0.6528, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "grad_norm": 3.017970085144043, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "learning_rate": 0.00024536240261327003, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "loss": 0.7078180313110352, + "step": 7250 + }, + { + "ce_loss": 0.2540886402130127, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "distill_loss": 0.12883436679840088, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "ref_ce_loss": 0.1449848860502243, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "loss": 0.5828919410705566, + "step": 7250 + }, + { + "ce_loss": 0.16208089888095856, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "distill_loss": 0.1052861288189888, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "ref_ce_loss": 0.10545025765895844, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "loss": 0.5507915019989014, + "step": 7250 + }, + { + "ce_loss": 0.1334887444972992, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "distill_loss": 0.1257612407207489, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "ref_ce_loss": 0.1570345163345337, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "loss": 0.6326401233673096, + "step": 7250 + }, + { + "ce_loss": 0.2364356815814972, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "distill_loss": 0.13967809081077576, + "epoch": 2.418278852568379, + "step": 7250 + }, + { + "epoch": 2.418278852568379, + "ref_ce_loss": 0.12372446805238724, + "step": 7250 + }, + { + "epoch": 2.4216144096064043, + "loss": 0.6028, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "grad_norm": 4.875604629516602, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "learning_rate": 0.0002452059594438839, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "loss": 0.4415472745895386, + "step": 7260 + }, + { + "ce_loss": 0.18437181413173676, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "distill_loss": 0.13511112332344055, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "ref_ce_loss": 0.08003831654787064, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "loss": 0.37648123502731323, + "step": 7260 + }, + { + "ce_loss": 0.09765855222940445, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "distill_loss": 0.10940150916576385, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "ref_ce_loss": 0.08033854514360428, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "loss": 0.6172460913658142, + "step": 7260 + }, + { + "ce_loss": 0.16564592719078064, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "distill_loss": 0.1328679919242859, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "ref_ce_loss": 0.1384454071521759, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "loss": 0.43410724401474, + "step": 7260 + }, + { + "ce_loss": 0.14599348604679108, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "distill_loss": 0.1227763220667839, + "epoch": 2.4216144096064043, + "step": 7260 + }, + { + "epoch": 2.4216144096064043, + "ref_ce_loss": 0.12593898177146912, + "step": 7260 + }, + { + "epoch": 2.4249499666444296, + "loss": 0.5562, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "grad_norm": 4.715060710906982, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "learning_rate": 0.00024504934265638347, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "loss": 0.49528905749320984, + "step": 7270 + }, + { + "ce_loss": 0.211774080991745, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "distill_loss": 0.1500503122806549, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "ref_ce_loss": 0.13327829539775848, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "loss": 0.7034370303153992, + "step": 7270 + }, + { + "ce_loss": 0.21872128546237946, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "distill_loss": 0.13050577044487, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "ref_ce_loss": 0.1425921618938446, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "loss": 0.4694666266441345, + "step": 7270 + }, + { + "ce_loss": 0.13093678653240204, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "distill_loss": 0.11353382468223572, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "ref_ce_loss": 0.11853688210248947, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "loss": 0.6298027038574219, + "step": 7270 + }, + { + "ce_loss": 0.15209899842739105, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "distill_loss": 0.11965539306402206, + "epoch": 2.4249499666444296, + "step": 7270 + }, + { + "epoch": 2.4249499666444296, + "ref_ce_loss": 0.10618112981319427, + "step": 7270 + }, + { + "epoch": 2.428285523682455, + "loss": 0.5977, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "grad_norm": 2.8958613872528076, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "learning_rate": 0.000244892552536376, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "loss": 0.879882276058197, + "step": 7280 + }, + { + "ce_loss": 0.21199427545070648, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "distill_loss": 0.12943725287914276, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "ref_ce_loss": 0.12772376835346222, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "loss": 0.5259045362472534, + "step": 7280 + }, + { + "ce_loss": 0.13685457408428192, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "distill_loss": 0.11776725947856903, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "ref_ce_loss": 0.11824677139520645, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "loss": 1.1360373497009277, + "step": 7280 + }, + { + "ce_loss": 0.20049446821212769, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "distill_loss": 0.1447339951992035, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "ref_ce_loss": 0.15620869398117065, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "loss": 0.2793632745742798, + "step": 7280 + }, + { + "ce_loss": 0.09599223732948303, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "distill_loss": 0.09701315313577652, + "epoch": 2.428285523682455, + "step": 7280 + }, + { + "epoch": 2.428285523682455, + "ref_ce_loss": 0.0862441137433052, + "step": 7280 + }, + { + "epoch": 2.4316210807204803, + "loss": 0.5863, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "grad_norm": 2.2033097743988037, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "learning_rate": 0.0002447355893697847, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "loss": 0.41150063276290894, + "step": 7290 + }, + { + "ce_loss": 0.10379496961832047, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "distill_loss": 0.08479201048612595, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "ref_ce_loss": 0.15390710532665253, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "loss": 0.44651439785957336, + "step": 7290 + }, + { + "ce_loss": 0.1582019180059433, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "distill_loss": 0.08264210820198059, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "ref_ce_loss": 0.09824780374765396, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "loss": 0.736008882522583, + "step": 7290 + }, + { + "ce_loss": 0.1359715312719345, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "distill_loss": 0.11413221061229706, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "ref_ce_loss": 0.14620079100131989, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "loss": 0.6262930631637573, + "step": 7290 + }, + { + "ce_loss": 0.2595474123954773, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "distill_loss": 0.12630505859851837, + "epoch": 2.4316210807204803, + "step": 7290 + }, + { + "epoch": 2.4316210807204803, + "ref_ce_loss": 0.16754832863807678, + "step": 7290 + }, + { + "epoch": 2.4349566377585057, + "loss": 0.5062, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "grad_norm": 4.805685997009277, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "learning_rate": 0.00024457845344284855, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "loss": 0.8836522698402405, + "step": 7300 + }, + { + "ce_loss": 0.24709133803844452, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "distill_loss": 0.15019738674163818, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "ref_ce_loss": 0.15257512032985687, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "loss": 0.40954825282096863, + "step": 7300 + }, + { + "ce_loss": 0.12927237153053284, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "distill_loss": 0.10678776353597641, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "ref_ce_loss": 0.14493626356124878, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "loss": 1.000192642211914, + "step": 7300 + }, + { + "ce_loss": 0.23533885180950165, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "distill_loss": 0.16133449971675873, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "ref_ce_loss": 0.1425640732049942, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "loss": 0.519072949886322, + "step": 7300 + }, + { + "ce_loss": 0.07202491909265518, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "distill_loss": 0.1196460872888565, + "epoch": 2.4349566377585057, + "step": 7300 + }, + { + "epoch": 2.4349566377585057, + "ref_ce_loss": 0.09123296290636063, + "step": 7300 + }, + { + "epoch": 2.438292194796531, + "loss": 0.5642, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "grad_norm": 2.3900463581085205, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "learning_rate": 0.0002444211450421214, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "loss": 0.5913569927215576, + "step": 7310 + }, + { + "ce_loss": 0.236644446849823, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "distill_loss": 0.14004617929458618, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "ref_ce_loss": 0.11488953977823257, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "loss": 0.7740991115570068, + "step": 7310 + }, + { + "ce_loss": 0.2927137315273285, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "distill_loss": 0.13568630814552307, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "ref_ce_loss": 0.17373815178871155, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "loss": 0.47589385509490967, + "step": 7310 + }, + { + "ce_loss": 0.09717685729265213, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "distill_loss": 0.13696280121803284, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "ref_ce_loss": 0.09850306063890457, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "loss": 0.6440833806991577, + "step": 7310 + }, + { + "ce_loss": 0.1265231966972351, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "distill_loss": 0.13772481679916382, + "epoch": 2.438292194796531, + "step": 7310 + }, + { + "epoch": 2.438292194796531, + "ref_ce_loss": 0.08583074063062668, + "step": 7310 + }, + { + "epoch": 2.4416277518345564, + "loss": 0.5549, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "grad_norm": 3.4240992069244385, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "learning_rate": 0.00024426366445447185, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "loss": 0.738955557346344, + "step": 7320 + }, + { + "ce_loss": 0.23381483554840088, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "distill_loss": 0.17107410728931427, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "ref_ce_loss": 0.15074104070663452, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "loss": 0.6326438188552856, + "step": 7320 + }, + { + "ce_loss": 0.225467249751091, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "distill_loss": 0.15438255667686462, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "ref_ce_loss": 0.11623335629701614, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "loss": 0.40885937213897705, + "step": 7320 + }, + { + "ce_loss": 0.12698811292648315, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "distill_loss": 0.130739226937294, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "ref_ce_loss": 0.08082147687673569, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "loss": 0.30342355370521545, + "step": 7320 + }, + { + "ce_loss": 0.10416149348020554, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "distill_loss": 0.10678577423095703, + "epoch": 2.4416277518345564, + "step": 7320 + }, + { + "epoch": 2.4416277518345564, + "ref_ce_loss": 0.09228299558162689, + "step": 7320 + }, + { + "epoch": 2.4449633088725817, + "loss": 0.5681, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "grad_norm": 1.6141737699508667, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "learning_rate": 0.00024410601196708236, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "loss": 0.8059208989143372, + "step": 7330 + }, + { + "ce_loss": 0.22274239361286163, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "distill_loss": 0.14510369300842285, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "ref_ce_loss": 0.21629805862903595, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "loss": 0.5609763860702515, + "step": 7330 + }, + { + "ce_loss": 0.12866266071796417, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "distill_loss": 0.09237472712993622, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "ref_ce_loss": 0.07987705618143082, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "loss": 0.5934087634086609, + "step": 7330 + }, + { + "ce_loss": 0.23136092722415924, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "distill_loss": 0.12402576208114624, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "ref_ce_loss": 0.1395598202943802, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "loss": 0.40078234672546387, + "step": 7330 + }, + { + "ce_loss": 0.1275785267353058, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "distill_loss": 0.12972263991832733, + "epoch": 2.4449633088725817, + "step": 7330 + }, + { + "epoch": 2.4449633088725817, + "ref_ce_loss": 0.11326507478952408, + "step": 7330 + }, + { + "epoch": 2.448298865910607, + "loss": 0.5538, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "grad_norm": 3.5968334674835205, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "learning_rate": 0.0002439481878674488, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "loss": 0.6532106995582581, + "step": 7340 + }, + { + "ce_loss": 0.21292003989219666, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "distill_loss": 0.1319594830274582, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "ref_ce_loss": 0.10543256253004074, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "loss": 0.4424779713153839, + "step": 7340 + }, + { + "ce_loss": 0.14241373538970947, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "distill_loss": 0.1377052217721939, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "ref_ce_loss": 0.09160786122083664, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "loss": 0.4995424449443817, + "step": 7340 + }, + { + "ce_loss": 0.11426243185997009, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "distill_loss": 0.11159699410200119, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "ref_ce_loss": 0.14085379242897034, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "loss": 0.7496430277824402, + "step": 7340 + }, + { + "ce_loss": 0.15991029143333435, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "distill_loss": 0.12658250331878662, + "epoch": 2.448298865910607, + "step": 7340 + }, + { + "epoch": 2.448298865910607, + "ref_ce_loss": 0.11923353374004364, + "step": 7340 + }, + { + "epoch": 2.4516344229486324, + "loss": 0.5659, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "grad_norm": 2.7785444259643555, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "learning_rate": 0.00024379019244338007, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "loss": 0.6230776906013489, + "step": 7350 + }, + { + "ce_loss": 0.20530030131340027, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "distill_loss": 0.1313774734735489, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "ref_ce_loss": 0.15010467171669006, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "loss": 0.5327169299125671, + "step": 7350 + }, + { + "ce_loss": 0.13175390660762787, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "distill_loss": 0.11843869090080261, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "ref_ce_loss": 0.11874283850193024, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "loss": 0.5524466037750244, + "step": 7350 + }, + { + "ce_loss": 0.17544156312942505, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "distill_loss": 0.10477293282747269, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "ref_ce_loss": 0.126258984208107, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "loss": 0.23935043811798096, + "step": 7350 + }, + { + "ce_loss": 0.0685420110821724, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "distill_loss": 0.10156445950269699, + "epoch": 2.4516344229486324, + "step": 7350 + }, + { + "epoch": 2.4516344229486324, + "ref_ce_loss": 0.06903655081987381, + "step": 7350 + }, + { + "epoch": 2.454969979986658, + "loss": 0.588, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "grad_norm": 4.6122355461120605, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "learning_rate": 0.00024363202598299755, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "loss": 0.460933655500412, + "step": 7360 + }, + { + "ce_loss": 0.1626943051815033, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "distill_loss": 0.1440202295780182, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "ref_ce_loss": 0.1275286227464676, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "loss": 0.45748433470726013, + "step": 7360 + }, + { + "ce_loss": 0.14533036947250366, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "distill_loss": 0.1288251280784607, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "ref_ce_loss": 0.13756705820560455, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "loss": 0.7040202021598816, + "step": 7360 + }, + { + "ce_loss": 0.20019203424453735, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "distill_loss": 0.1584950089454651, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "ref_ce_loss": 0.11588919907808304, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "loss": 0.7524511218070984, + "step": 7360 + }, + { + "ce_loss": 0.1585523635149002, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "distill_loss": 0.17898641526699066, + "epoch": 2.454969979986658, + "step": 7360 + }, + { + "epoch": 2.454969979986658, + "ref_ce_loss": 0.09428147971630096, + "step": 7360 + }, + { + "epoch": 2.458305537024683, + "loss": 0.5598, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "grad_norm": 8.720335960388184, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "learning_rate": 0.00024347368877473448, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "loss": 0.4737747311592102, + "step": 7370 + }, + { + "ce_loss": 0.12464018166065216, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "distill_loss": 0.24916736781597137, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "ref_ce_loss": 0.07362619787454605, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "loss": 0.5169686079025269, + "step": 7370 + }, + { + "ce_loss": 0.13986755907535553, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "distill_loss": 0.18237434327602386, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "ref_ce_loss": 0.08128245919942856, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "loss": 0.71217280626297, + "step": 7370 + }, + { + "ce_loss": 0.06464303284883499, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "distill_loss": 0.10259261727333069, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "ref_ce_loss": 0.08970564603805542, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "loss": 1.0477197170257568, + "step": 7370 + }, + { + "ce_loss": 0.1423245221376419, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "distill_loss": 0.17700180411338806, + "epoch": 2.458305537024683, + "step": 7370 + }, + { + "epoch": 2.458305537024683, + "ref_ce_loss": 0.1467050313949585, + "step": 7370 + }, + { + "epoch": 2.4616410940627085, + "loss": 0.9109, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "grad_norm": 8.063809394836426, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "learning_rate": 0.00024331518110733545, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "loss": 0.5266667008399963, + "step": 7380 + }, + { + "ce_loss": 0.0833396315574646, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "distill_loss": 0.29295581579208374, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "ref_ce_loss": 0.1029653325676918, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "loss": 1.089504361152649, + "step": 7380 + }, + { + "ce_loss": 0.1683725267648697, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "distill_loss": 0.7212741374969482, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "ref_ce_loss": 0.15166783332824707, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "loss": 1.0741162300109863, + "step": 7380 + }, + { + "ce_loss": 0.262071818113327, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "distill_loss": 0.4402065873146057, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "ref_ce_loss": 0.06969785690307617, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "loss": 0.5918949246406555, + "step": 7380 + }, + { + "ce_loss": 0.07457996159791946, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "distill_loss": 0.38692402839660645, + "epoch": 2.4616410940627085, + "step": 7380 + }, + { + "epoch": 2.4616410940627085, + "ref_ce_loss": 0.08364200592041016, + "step": 7380 + }, + { + "epoch": 2.464976651100734, + "loss": 0.6996, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "grad_norm": 2.4126689434051514, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "learning_rate": 0.00024315650326985595, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "loss": 1.2089097499847412, + "step": 7390 + }, + { + "ce_loss": 0.18091559410095215, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "distill_loss": 0.41868704557418823, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "ref_ce_loss": 0.13473795354366302, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "loss": 0.7378758192062378, + "step": 7390 + }, + { + "ce_loss": 0.20848192274570465, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "distill_loss": 0.22565500438213348, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "ref_ce_loss": 0.1735285222530365, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "loss": 0.8218880891799927, + "step": 7390 + }, + { + "ce_loss": 0.2554478049278259, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "distill_loss": 0.24467813968658447, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "ref_ce_loss": 0.16238157451152802, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "loss": 0.6124982833862305, + "step": 7390 + }, + { + "ce_loss": 0.18273940682411194, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "distill_loss": 0.20060420036315918, + "epoch": 2.464976651100734, + "step": 7390 + }, + { + "epoch": 2.464976651100734, + "ref_ce_loss": 0.15322014689445496, + "step": 7390 + }, + { + "epoch": 2.468312208138759, + "loss": 0.6726, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "grad_norm": 3.222508192062378, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "learning_rate": 0.00024299765555166162, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "loss": 0.5121899247169495, + "step": 7400 + }, + { + "ce_loss": 0.15992094576358795, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "distill_loss": 0.17074273526668549, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "ref_ce_loss": 0.1101238951086998, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "loss": 0.6162878274917603, + "step": 7400 + }, + { + "ce_loss": 0.24921391904354095, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "distill_loss": 0.16301168501377106, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "ref_ce_loss": 0.13484197854995728, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "loss": 0.518480122089386, + "step": 7400 + }, + { + "ce_loss": 0.11398441344499588, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "distill_loss": 0.1706867665052414, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "ref_ce_loss": 0.09749495983123779, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "loss": 1.0171473026275635, + "step": 7400 + }, + { + "ce_loss": 0.14315776526927948, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "distill_loss": 0.17418572306632996, + "epoch": 2.468312208138759, + "step": 7400 + }, + { + "epoch": 2.468312208138759, + "ref_ce_loss": 0.11513806879520416, + "step": 7400 + }, + { + "epoch": 2.4716477651767845, + "loss": 0.5981, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "grad_norm": 3.1181561946868896, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "learning_rate": 0.00024283863824242825, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "loss": 0.4941348731517792, + "step": 7410 + }, + { + "ce_loss": 0.16458964347839355, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "distill_loss": 0.1792687326669693, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "ref_ce_loss": 0.14990025758743286, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "loss": 0.6867243647575378, + "step": 7410 + }, + { + "ce_loss": 0.2279725968837738, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "distill_loss": 0.27562469244003296, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "ref_ce_loss": 0.1251329928636551, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "loss": 0.5477474331855774, + "step": 7410 + }, + { + "ce_loss": 0.22149042785167694, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "distill_loss": 0.16655884683132172, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "ref_ce_loss": 0.11619669944047928, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "loss": 0.7532987594604492, + "step": 7410 + }, + { + "ce_loss": 0.17848901450634003, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "distill_loss": 0.4082738757133484, + "epoch": 2.4716477651767845, + "step": 7410 + }, + { + "epoch": 2.4716477651767845, + "ref_ce_loss": 0.12353993207216263, + "step": 7410 + }, + { + "epoch": 2.47498332221481, + "loss": 0.6411, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "grad_norm": 4.742797374725342, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "learning_rate": 0.0002426794516321405, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "loss": 0.5106517672538757, + "step": 7420 + }, + { + "ce_loss": 0.14942465722560883, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "distill_loss": 0.15792769193649292, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "ref_ce_loss": 0.1275612860918045, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "loss": 0.5815585255622864, + "step": 7420 + }, + { + "ce_loss": 0.08623402565717697, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "distill_loss": 0.14738768339157104, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "ref_ce_loss": 0.12640981376171112, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "loss": 0.7946469783782959, + "step": 7420 + }, + { + "ce_loss": 0.2270994633436203, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "distill_loss": 0.25851958990097046, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "ref_ce_loss": 0.13658231496810913, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "loss": 0.5645567774772644, + "step": 7420 + }, + { + "ce_loss": 0.11589132994413376, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "distill_loss": 0.3163776099681854, + "epoch": 2.47498332221481, + "step": 7420 + }, + { + "epoch": 2.47498332221481, + "ref_ce_loss": 0.13185791671276093, + "step": 7420 + }, + { + "epoch": 2.4783188792528352, + "loss": 0.6218, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "grad_norm": 3.2630066871643066, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "learning_rate": 0.00024252009601109206, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "loss": 1.7518689632415771, + "step": 7430 + }, + { + "ce_loss": 0.21943138539791107, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "distill_loss": 0.16195166110992432, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "ref_ce_loss": 0.13848480582237244, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "loss": 0.42734628915786743, + "step": 7430 + }, + { + "ce_loss": 0.13677547872066498, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "distill_loss": 0.14099811017513275, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "ref_ce_loss": 0.09117099642753601, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "loss": 0.49946892261505127, + "step": 7430 + }, + { + "ce_loss": 0.11629904061555862, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "distill_loss": 0.1060149148106575, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "ref_ce_loss": 0.10538594424724579, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "loss": 0.5545316934585571, + "step": 7430 + }, + { + "ce_loss": 0.17956024408340454, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "distill_loss": 0.16018567979335785, + "epoch": 2.4783188792528352, + "step": 7430 + }, + { + "epoch": 2.4783188792528352, + "ref_ce_loss": 0.11891523748636246, + "step": 7430 + }, + { + "epoch": 2.4816544362908606, + "loss": 0.5819, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "grad_norm": 2.5894501209259033, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "learning_rate": 0.0002423605716698847, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "loss": 0.462877482175827, + "step": 7440 + }, + { + "ce_loss": 0.21872608363628387, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "distill_loss": 0.15316803753376007, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "ref_ce_loss": 0.09063727408647537, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "loss": 0.49275457859039307, + "step": 7440 + }, + { + "ce_loss": 0.16110378503799438, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "distill_loss": 0.16137196123600006, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "ref_ce_loss": 0.08860895782709122, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "loss": 0.5795860886573792, + "step": 7440 + }, + { + "ce_loss": 0.150924414396286, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "distill_loss": 0.2513508200645447, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "ref_ce_loss": 0.1376534402370453, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "loss": 0.39724642038345337, + "step": 7440 + }, + { + "ce_loss": 0.14655473828315735, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "distill_loss": 0.12975682318210602, + "epoch": 2.4816544362908606, + "step": 7440 + }, + { + "epoch": 2.4816544362908606, + "ref_ce_loss": 0.12082649022340775, + "step": 7440 + }, + { + "epoch": 2.484989993328886, + "loss": 0.6028, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "grad_norm": 2.79844331741333, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "learning_rate": 0.00024220087889942793, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "loss": 0.46565479040145874, + "step": 7450 + }, + { + "ce_loss": 0.16986431181430817, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "distill_loss": 0.12018732726573944, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "ref_ce_loss": 0.17548438906669617, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "loss": 0.6156365871429443, + "step": 7450 + }, + { + "ce_loss": 0.14211425185203552, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "distill_loss": 0.3125298321247101, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "ref_ce_loss": 0.111871138215065, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "loss": 0.9736812114715576, + "step": 7450 + }, + { + "ce_loss": 0.2266884446144104, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "distill_loss": 0.2732122242450714, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "ref_ce_loss": 0.16810236871242523, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "loss": 0.8139933347702026, + "step": 7450 + }, + { + "ce_loss": 0.19329741597175598, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "distill_loss": 0.17764835059642792, + "epoch": 2.484989993328886, + "step": 7450 + }, + { + "epoch": 2.484989993328886, + "ref_ce_loss": 0.16515131294727325, + "step": 7450 + }, + { + "epoch": 2.4883255503669113, + "loss": 0.5792, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "grad_norm": 2.714251756668091, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "learning_rate": 0.00024204101799093824, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "loss": 0.569564700126648, + "step": 7460 + }, + { + "ce_loss": 0.17240485548973083, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "distill_loss": 0.1441771388053894, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "ref_ce_loss": 0.10838434845209122, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "loss": 0.6830180287361145, + "step": 7460 + }, + { + "ce_loss": 0.2065476030111313, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "distill_loss": 0.2618443965911865, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "ref_ce_loss": 0.1021922305226326, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "loss": 0.39164695143699646, + "step": 7460 + }, + { + "ce_loss": 0.1578332930803299, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "distill_loss": 0.14780035614967346, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "ref_ce_loss": 0.08552494645118713, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "loss": 0.42083337903022766, + "step": 7460 + }, + { + "ce_loss": 0.1467936635017395, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "distill_loss": 0.11076997220516205, + "epoch": 2.4883255503669113, + "step": 7460 + }, + { + "epoch": 2.4883255503669113, + "ref_ce_loss": 0.11539274454116821, + "step": 7460 + }, + { + "epoch": 2.4916611074049366, + "loss": 0.6286, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "grad_norm": 4.215702056884766, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "learning_rate": 0.00024188098923593902, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "loss": 0.5800678133964539, + "step": 7470 + }, + { + "ce_loss": 0.16411294043064117, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "distill_loss": 0.23902928829193115, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "ref_ce_loss": 0.11827875673770905, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "loss": 0.3950924873352051, + "step": 7470 + }, + { + "ce_loss": 0.061212923377752304, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "distill_loss": 0.08429202437400818, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "ref_ce_loss": 0.0835026279091835, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "loss": 0.43904346227645874, + "step": 7470 + }, + { + "ce_loss": 0.10477552562952042, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "distill_loss": 0.2018274962902069, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "ref_ce_loss": 0.08816604316234589, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "loss": 0.5673875212669373, + "step": 7470 + }, + { + "ce_loss": 0.18711598217487335, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "distill_loss": 0.19603317975997925, + "epoch": 2.4916611074049366, + "step": 7470 + }, + { + "epoch": 2.4916611074049366, + "ref_ce_loss": 0.15125808119773865, + "step": 7470 + }, + { + "epoch": 2.494996664442962, + "loss": 0.7135, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "grad_norm": 3.397770404815674, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "learning_rate": 0.00024172079292625952, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "loss": 0.4040660262107849, + "step": 7480 + }, + { + "ce_loss": 0.1057736873626709, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "distill_loss": 0.17069004476070404, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "ref_ce_loss": 0.0931447446346283, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "loss": 0.6216812133789062, + "step": 7480 + }, + { + "ce_loss": 0.12649120390415192, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "distill_loss": 0.20638568699359894, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "ref_ce_loss": 0.1636555939912796, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "loss": 0.5215267539024353, + "step": 7480 + }, + { + "ce_loss": 0.1331818848848343, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "distill_loss": 0.21987393498420715, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "ref_ce_loss": 0.13187724351882935, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "loss": 0.6386827230453491, + "step": 7480 + }, + { + "ce_loss": 0.10811474174261093, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "distill_loss": 0.24786339700222015, + "epoch": 2.494996664442962, + "step": 7480 + }, + { + "epoch": 2.494996664442962, + "ref_ce_loss": 0.09294579923152924, + "step": 7480 + }, + { + "epoch": 2.4983322214809873, + "loss": 0.6108, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "grad_norm": 3.2522027492523193, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "learning_rate": 0.00024156042935403462, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "loss": 0.5217353701591492, + "step": 7490 + }, + { + "ce_loss": 0.20177902281284332, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "distill_loss": 0.17955923080444336, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "ref_ce_loss": 0.10753311216831207, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "loss": 0.29052671790122986, + "step": 7490 + }, + { + "ce_loss": 0.07492802292108536, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "distill_loss": 0.13153742253780365, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "ref_ce_loss": 0.08389332890510559, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "loss": 0.9276106357574463, + "step": 7490 + }, + { + "ce_loss": 0.21934948861598969, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "distill_loss": 0.2663779854774475, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "ref_ce_loss": 0.14847496151924133, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "loss": 0.433186411857605, + "step": 7490 + }, + { + "ce_loss": 0.15955831110477448, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "distill_loss": 0.1145581305027008, + "epoch": 2.4983322214809873, + "step": 7490 + }, + { + "epoch": 2.4983322214809873, + "ref_ce_loss": 0.12650629878044128, + "step": 7490 + }, + { + "epoch": 2.5016677785190127, + "loss": 0.539, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "grad_norm": 2.983107805252075, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "learning_rate": 0.0002413998988117042, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "loss": 0.9979486465454102, + "step": 7500 + }, + { + "ce_loss": 0.2776032090187073, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "distill_loss": 0.11760402470827103, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "ref_ce_loss": 0.19638264179229736, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "loss": 0.421108603477478, + "step": 7500 + }, + { + "ce_loss": 0.16717898845672607, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "distill_loss": 0.10004810243844986, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "ref_ce_loss": 0.11813834309577942, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "loss": 0.43692728877067566, + "step": 7500 + }, + { + "ce_loss": 0.22714455425739288, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "distill_loss": 0.12298592180013657, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "ref_ce_loss": 0.08648133277893066, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "loss": 0.8263677358627319, + "step": 7500 + }, + { + "ce_loss": 0.2486896514892578, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "distill_loss": 0.230479896068573, + "epoch": 2.5016677785190127, + "step": 7500 + }, + { + "epoch": 2.5016677785190127, + "ref_ce_loss": 0.14610914885997772, + "step": 7500 + }, + { + "epoch": 2.505003335557038, + "loss": 0.5267, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "grad_norm": 3.099586248397827, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "learning_rate": 0.00024123920159201267, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "loss": 0.4549373984336853, + "step": 7510 + }, + { + "ce_loss": 0.1630726009607315, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "distill_loss": 0.08730512112379074, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "ref_ce_loss": 0.12055899947881699, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "loss": 0.3760097324848175, + "step": 7510 + }, + { + "ce_loss": 0.130742609500885, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "distill_loss": 0.12639303505420685, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "ref_ce_loss": 0.11874870955944061, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "loss": 0.5273082852363586, + "step": 7510 + }, + { + "ce_loss": 0.13833826780319214, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "distill_loss": 0.09497985988855362, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "ref_ce_loss": 0.14981545507907867, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "loss": 0.5824294686317444, + "step": 7510 + }, + { + "ce_loss": 0.11524631083011627, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "distill_loss": 0.09551960229873657, + "epoch": 2.505003335557038, + "step": 7510 + }, + { + "epoch": 2.505003335557038, + "ref_ce_loss": 0.10985547304153442, + "step": 7510 + }, + { + "epoch": 2.5083388925950634, + "loss": 1.6356, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "grad_norm": 36.170310974121094, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "learning_rate": 0.00024107833798800836, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "loss": 4.801138877868652, + "step": 7520 + }, + { + "ce_loss": 3.2041690349578857, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "distill_loss": 0.11274446547031403, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "ref_ce_loss": 1.380598783493042, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "loss": 5.07634162902832, + "step": 7520 + }, + { + "ce_loss": 3.3332977294921875, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "distill_loss": 0.09714693576097488, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "ref_ce_loss": 1.598562240600586, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "loss": 5.017405986785889, + "step": 7520 + }, + { + "ce_loss": 3.367450714111328, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "distill_loss": 0.1504187136888504, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "ref_ce_loss": 1.3405362367630005, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "loss": 5.071421146392822, + "step": 7520 + }, + { + "ce_loss": 3.21246337890625, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "distill_loss": 0.14449955523014069, + "epoch": 2.5083388925950634, + "step": 7520 + }, + { + "epoch": 2.5083388925950634, + "ref_ce_loss": 1.651926875114441, + "step": 7520 + }, + { + "epoch": 2.5116744496330887, + "loss": 2.5051, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "grad_norm": 7.543781757354736, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "learning_rate": 0.00024091730829304303, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "loss": 1.0518018007278442, + "step": 7530 + }, + { + "ce_loss": 0.5136626958847046, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "distill_loss": 0.10988738387823105, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "ref_ce_loss": 0.2461606115102768, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "loss": 0.971845805644989, + "step": 7530 + }, + { + "ce_loss": 0.5943147540092468, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "distill_loss": 0.12562216818332672, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "ref_ce_loss": 0.2518536448478699, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "loss": 1.4921238422393799, + "step": 7530 + }, + { + "ce_loss": 0.5412873029708862, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "distill_loss": 0.11847780644893646, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "ref_ce_loss": 0.322559118270874, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "loss": 0.9531009793281555, + "step": 7530 + }, + { + "ce_loss": 0.5480855107307434, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "distill_loss": 0.10795928537845612, + "epoch": 2.5116744496330887, + "step": 7530 + }, + { + "epoch": 2.5116744496330887, + "ref_ce_loss": 0.2576931118965149, + "step": 7530 + }, + { + "epoch": 2.515010006671114, + "loss": 1.0296, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "grad_norm": 4.333218097686768, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "learning_rate": 0.00024075611280077134, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "loss": 0.8116427659988403, + "step": 7540 + }, + { + "ce_loss": 0.30960413813591003, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "distill_loss": 0.14997927844524384, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "ref_ce_loss": 0.15476161241531372, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "loss": 0.52923983335495, + "step": 7540 + }, + { + "ce_loss": 0.23071542382240295, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "distill_loss": 0.12685328722000122, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "ref_ce_loss": 0.17161113023757935, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "loss": 0.6986463665962219, + "step": 7540 + }, + { + "ce_loss": 0.31322136521339417, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "distill_loss": 0.11960118263959885, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "ref_ce_loss": 0.14314311742782593, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "loss": 0.6826801300048828, + "step": 7540 + }, + { + "ce_loss": 0.3476658761501312, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "distill_loss": 0.11629290878772736, + "epoch": 2.515010006671114, + "step": 7540 + }, + { + "epoch": 2.515010006671114, + "ref_ce_loss": 0.1628488004207611, + "step": 7540 + }, + { + "epoch": 2.5183455637091394, + "loss": 0.7002, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "grad_norm": 3.7075579166412354, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "learning_rate": 0.0002405947518051503, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "loss": 0.43118125200271606, + "step": 7550 + }, + { + "ce_loss": 0.15303654968738556, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "distill_loss": 0.14630776643753052, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "ref_ce_loss": 0.13163325190544128, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "loss": 0.6882593631744385, + "step": 7550 + }, + { + "ce_loss": 0.20254714787006378, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "distill_loss": 0.16186225414276123, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "ref_ce_loss": 0.12776897847652435, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "loss": 0.8160839080810547, + "step": 7550 + }, + { + "ce_loss": 0.24247363209724426, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "distill_loss": 0.20983321964740753, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "ref_ce_loss": 0.1945839673280716, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "loss": 0.5754175186157227, + "step": 7550 + }, + { + "ce_loss": 0.1744595170021057, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "distill_loss": 0.1595647931098938, + "epoch": 2.5183455637091394, + "step": 7550 + }, + { + "epoch": 2.5183455637091394, + "ref_ce_loss": 0.15324027836322784, + "step": 7550 + }, + { + "epoch": 2.5216811207471648, + "loss": 0.5811, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "grad_norm": 3.553694248199463, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "learning_rate": 0.00024043322560043863, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "loss": 0.5153651237487793, + "step": 7560 + }, + { + "ce_loss": 0.1536191999912262, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "distill_loss": 0.09473700821399689, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "ref_ce_loss": 0.1389169543981552, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "loss": 0.4598870277404785, + "step": 7560 + }, + { + "ce_loss": 0.15863952040672302, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "distill_loss": 0.13414156436920166, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "ref_ce_loss": 0.0958477035164833, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "loss": 0.47331902384757996, + "step": 7560 + }, + { + "ce_loss": 0.18015851080417633, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "distill_loss": 0.11007126420736313, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "ref_ce_loss": 0.11271440237760544, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "loss": 0.6730250120162964, + "step": 7560 + }, + { + "ce_loss": 0.20672163367271423, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "distill_loss": 0.13213717937469482, + "epoch": 2.5216811207471648, + "step": 7560 + }, + { + "epoch": 2.5216811207471648, + "ref_ce_loss": 0.1556655317544937, + "step": 7560 + }, + { + "epoch": 2.52501667778519, + "loss": 0.5522, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "grad_norm": 3.1989450454711914, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "learning_rate": 0.00024027153448119646, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "loss": 0.3477581739425659, + "step": 7570 + }, + { + "ce_loss": 0.12741214036941528, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "distill_loss": 0.11304664611816406, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "ref_ce_loss": 0.1070604920387268, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "loss": 0.6823329925537109, + "step": 7570 + }, + { + "ce_loss": 0.2807355523109436, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "distill_loss": 0.11333362013101578, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "ref_ce_loss": 0.20557934045791626, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "loss": 0.6630694270133972, + "step": 7570 + }, + { + "ce_loss": 0.3418087363243103, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "distill_loss": 0.11887340247631073, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "ref_ce_loss": 0.12373942881822586, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "loss": 0.5742785930633545, + "step": 7570 + }, + { + "ce_loss": 0.21951600909233093, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "distill_loss": 0.14838585257530212, + "epoch": 2.52501667778519, + "step": 7570 + }, + { + "epoch": 2.52501667778519, + "ref_ce_loss": 0.16113975644111633, + "step": 7570 + }, + { + "epoch": 2.5283522348232155, + "loss": 0.605, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "grad_norm": 3.2635841369628906, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "learning_rate": 0.0002401096787422846, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "loss": 0.6578416228294373, + "step": 7580 + }, + { + "ce_loss": 0.24671395123004913, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "distill_loss": 0.18206465244293213, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "ref_ce_loss": 0.1655759960412979, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "loss": 0.3640592694282532, + "step": 7580 + }, + { + "ce_loss": 0.09979964792728424, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "distill_loss": 0.07907267659902573, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "ref_ce_loss": 0.09818270802497864, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "loss": 0.4667300879955292, + "step": 7580 + }, + { + "ce_loss": 0.17414849996566772, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "distill_loss": 0.17536064982414246, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "ref_ce_loss": 0.11710204184055328, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "loss": 0.5661674737930298, + "step": 7580 + }, + { + "ce_loss": 0.20824968814849854, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "distill_loss": 0.14784125983715057, + "epoch": 2.5283522348232155, + "step": 7580 + }, + { + "epoch": 2.5283522348232155, + "ref_ce_loss": 0.16290564835071564, + "step": 7580 + }, + { + "epoch": 2.531687791861241, + "loss": 0.6413, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "grad_norm": 6.0181193351745605, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "learning_rate": 0.0002399476586788641, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "loss": 0.512836217880249, + "step": 7590 + }, + { + "ce_loss": 0.1735915094614029, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "distill_loss": 0.1299559324979782, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "ref_ce_loss": 0.13969220221042633, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "loss": 0.43017613887786865, + "step": 7590 + }, + { + "ce_loss": 0.0749160647392273, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "distill_loss": 0.11237940937280655, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "ref_ce_loss": 0.11628800630569458, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "loss": 0.6326050162315369, + "step": 7590 + }, + { + "ce_loss": 0.26504430174827576, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "distill_loss": 0.14532341063022614, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "ref_ce_loss": 0.15772642195224762, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "loss": 0.5163159370422363, + "step": 7590 + }, + { + "ce_loss": 0.21247656643390656, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "distill_loss": 0.1484421044588089, + "epoch": 2.531687791861241, + "step": 7590 + }, + { + "epoch": 2.531687791861241, + "ref_ce_loss": 0.1257714480161667, + "step": 7590 + }, + { + "epoch": 2.535023348899266, + "loss": 0.6002, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "grad_norm": 3.680274724960327, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "learning_rate": 0.00023978547458639566, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "loss": 0.6189171075820923, + "step": 7600 + }, + { + "ce_loss": 0.19142690300941467, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "distill_loss": 0.1640305519104004, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "ref_ce_loss": 0.18159721791744232, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "loss": 0.5700227618217468, + "step": 7600 + }, + { + "ce_loss": 0.18071216344833374, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "distill_loss": 0.13341012597084045, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "ref_ce_loss": 0.12869039177894592, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "loss": 0.6764943599700928, + "step": 7600 + }, + { + "ce_loss": 0.25785961747169495, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "distill_loss": 0.13872337341308594, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "ref_ce_loss": 0.2371172457933426, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "loss": 0.5429132580757141, + "step": 7600 + }, + { + "ce_loss": 0.23566880822181702, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "distill_loss": 0.16433201730251312, + "epoch": 2.535023348899266, + "step": 7600 + }, + { + "epoch": 2.535023348899266, + "ref_ce_loss": 0.14133983850479126, + "step": 7600 + }, + { + "epoch": 2.5383589059372915, + "loss": 0.5731, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "grad_norm": 1.8018347024917603, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "learning_rate": 0.00023962312676063905, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "loss": 0.5834633111953735, + "step": 7610 + }, + { + "ce_loss": 0.18504847586154938, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "distill_loss": 0.1272597312927246, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "ref_ce_loss": 0.16280600428581238, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "loss": 0.4163941740989685, + "step": 7610 + }, + { + "ce_loss": 0.12109852582216263, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "distill_loss": 0.0993369072675705, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "ref_ce_loss": 0.11387766897678375, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "loss": 0.6831787824630737, + "step": 7610 + }, + { + "ce_loss": 0.17585034668445587, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "distill_loss": 0.16444019973278046, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "ref_ce_loss": 0.10961062461137772, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "loss": 0.48016393184661865, + "step": 7610 + }, + { + "ce_loss": 0.14712761342525482, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "distill_loss": 0.14475704729557037, + "epoch": 2.5383589059372915, + "step": 7610 + }, + { + "epoch": 2.5383589059372915, + "ref_ce_loss": 0.14661256968975067, + "step": 7610 + }, + { + "epoch": 2.541694462975317, + "loss": 0.5923, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "grad_norm": 2.1599721908569336, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "learning_rate": 0.0002394606154976526, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "loss": 0.7730429172515869, + "step": 7620 + }, + { + "ce_loss": 0.31173962354660034, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "distill_loss": 0.14953872561454773, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "ref_ce_loss": 0.16186164319515228, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "loss": 0.5969633460044861, + "step": 7620 + }, + { + "ce_loss": 0.22048798203468323, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "distill_loss": 0.143099844455719, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "ref_ce_loss": 0.18054793775081635, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "loss": 0.3590729534626007, + "step": 7620 + }, + { + "ce_loss": 0.12761783599853516, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "distill_loss": 0.11471801996231079, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "ref_ce_loss": 0.08587874472141266, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "loss": 0.5616735816001892, + "step": 7620 + }, + { + "ce_loss": 0.19857123494148254, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "distill_loss": 0.10939596593379974, + "epoch": 2.541694462975317, + "step": 7620 + }, + { + "epoch": 2.541694462975317, + "ref_ce_loss": 0.15047235786914825, + "step": 7620 + }, + { + "epoch": 2.545030020013342, + "loss": 0.5276, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "grad_norm": 3.97973370552063, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "learning_rate": 0.00023929794109379287, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "loss": 0.6210626363754272, + "step": 7630 + }, + { + "ce_loss": 0.17225535213947296, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "distill_loss": 0.1483703851699829, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "ref_ce_loss": 0.1286020129919052, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "loss": 0.5207104682922363, + "step": 7630 + }, + { + "ce_loss": 0.1763487607240677, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "distill_loss": 0.11557600647211075, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "ref_ce_loss": 0.22873103618621826, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "loss": 0.7825448513031006, + "step": 7630 + }, + { + "ce_loss": 0.18735192716121674, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "distill_loss": 0.1284457892179489, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "ref_ce_loss": 0.13140769302845, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "loss": 0.5712705850601196, + "step": 7630 + }, + { + "ce_loss": 0.24999207258224487, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "distill_loss": 0.1476946920156479, + "epoch": 2.545030020013342, + "step": 7630 + }, + { + "epoch": 2.545030020013342, + "ref_ce_loss": 0.14753498136997223, + "step": 7630 + }, + { + "epoch": 2.5483655770513676, + "loss": 0.5822, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "grad_norm": 3.5894253253936768, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "learning_rate": 0.00023913510384571376, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "loss": 0.6420358419418335, + "step": 7640 + }, + { + "ce_loss": 0.23614124953746796, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "distill_loss": 0.11829626560211182, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "ref_ce_loss": 0.15863285958766937, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "loss": 0.31583189964294434, + "step": 7640 + }, + { + "ce_loss": 0.10105039924383163, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "distill_loss": 0.09995920956134796, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "ref_ce_loss": 0.10147218406200409, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "loss": 0.42695561051368713, + "step": 7640 + }, + { + "ce_loss": 0.18332314491271973, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "distill_loss": 0.12776078283786774, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "ref_ce_loss": 0.11573680490255356, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "loss": 0.4857944846153259, + "step": 7640 + }, + { + "ce_loss": 0.15125729143619537, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "distill_loss": 0.10850798338651657, + "epoch": 2.5483655770513676, + "step": 7640 + }, + { + "epoch": 2.5483655770513676, + "ref_ce_loss": 0.11452734470367432, + "step": 7640 + }, + { + "epoch": 2.551701134089393, + "loss": 0.5687, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "grad_norm": 2.613729238510132, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "learning_rate": 0.00023897210405036612, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "loss": 0.2363915741443634, + "step": 7650 + }, + { + "ce_loss": 0.07392606139183044, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "distill_loss": 0.07387639582157135, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "ref_ce_loss": 0.08818396925926208, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "loss": 0.4613707363605499, + "step": 7650 + }, + { + "ce_loss": 0.17740534245967865, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "distill_loss": 0.10959557443857193, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "ref_ce_loss": 0.12329917401075363, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "loss": 0.4941861927509308, + "step": 7650 + }, + { + "ce_loss": 0.1800439953804016, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "distill_loss": 0.11342333257198334, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "ref_ce_loss": 0.15587930381298065, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "loss": 0.5839400291442871, + "step": 7650 + }, + { + "ce_loss": 0.2504223883152008, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "distill_loss": 0.13013054430484772, + "epoch": 2.551701134089393, + "step": 7650 + }, + { + "epoch": 2.551701134089393, + "ref_ce_loss": 0.15354035794734955, + "step": 7650 + }, + { + "epoch": 2.5550366911274183, + "loss": 0.5413, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "grad_norm": 2.720123052597046, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "learning_rate": 0.00023880894200499733, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "loss": 0.5325741767883301, + "step": 7660 + }, + { + "ce_loss": 0.1940358281135559, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "distill_loss": 0.09372791647911072, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "ref_ce_loss": 0.1709611415863037, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "loss": 0.3856140375137329, + "step": 7660 + }, + { + "ce_loss": 0.10936636477708817, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "distill_loss": 0.08666469901800156, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "ref_ce_loss": 0.12398192286491394, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "loss": 0.494606614112854, + "step": 7660 + }, + { + "ce_loss": 0.16153691709041595, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "distill_loss": 0.09463351219892502, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "ref_ce_loss": 0.1084618866443634, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "loss": 0.4320124387741089, + "step": 7660 + }, + { + "ce_loss": 0.18520474433898926, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "distill_loss": 0.1259734332561493, + "epoch": 2.5550366911274183, + "step": 7660 + }, + { + "epoch": 2.5550366911274183, + "ref_ce_loss": 0.12075541168451309, + "step": 7660 + }, + { + "epoch": 2.5583722481654436, + "loss": 0.518, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "grad_norm": 4.928752899169922, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "learning_rate": 0.00023864561800715064, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "loss": 0.419145405292511, + "step": 7670 + }, + { + "ce_loss": 0.09538471698760986, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "distill_loss": 0.10431893914937973, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "ref_ce_loss": 0.07912413030862808, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "loss": 0.42450791597366333, + "step": 7670 + }, + { + "ce_loss": 0.1916923075914383, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "distill_loss": 0.09768582880496979, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "ref_ce_loss": 0.13494430482387543, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "loss": 0.4638766944408417, + "step": 7670 + }, + { + "ce_loss": 0.16185693442821503, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "distill_loss": 0.11444159597158432, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "ref_ce_loss": 0.07675541192293167, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "loss": 0.3174169063568115, + "step": 7670 + }, + { + "ce_loss": 0.12475244700908661, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "distill_loss": 0.09721045196056366, + "epoch": 2.5583722481654436, + "step": 7670 + }, + { + "epoch": 2.5583722481654436, + "ref_ce_loss": 0.09538126736879349, + "step": 7670 + }, + { + "epoch": 2.561707805203469, + "loss": 0.5129, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "grad_norm": 2.8273167610168457, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "learning_rate": 0.00023848213235466446, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "loss": 0.41403496265411377, + "step": 7680 + }, + { + "ce_loss": 0.1246468648314476, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "distill_loss": 0.10440246015787125, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "ref_ce_loss": 0.11642558127641678, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "loss": 0.4793902337551117, + "step": 7680 + }, + { + "ce_loss": 0.20326383411884308, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "distill_loss": 0.1064993366599083, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "ref_ce_loss": 0.16955675184726715, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "loss": 0.5131489038467407, + "step": 7680 + }, + { + "ce_loss": 0.1686738282442093, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "distill_loss": 0.1163623034954071, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "ref_ce_loss": 0.10202865302562714, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "loss": 0.5387371182441711, + "step": 7680 + }, + { + "ce_loss": 0.250522643327713, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "distill_loss": 0.11985168606042862, + "epoch": 2.561707805203469, + "step": 7680 + }, + { + "epoch": 2.561707805203469, + "ref_ce_loss": 0.12084297090768814, + "step": 7680 + }, + { + "epoch": 2.5650433622414943, + "loss": 0.525, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "grad_norm": 3.9550886154174805, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "learning_rate": 0.0002383184853456723, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "loss": 0.35102054476737976, + "step": 7690 + }, + { + "ce_loss": 0.10862553864717484, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "distill_loss": 0.1106899157166481, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "ref_ce_loss": 0.08591561019420624, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "loss": 0.5251890420913696, + "step": 7690 + }, + { + "ce_loss": 0.1936178356409073, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "distill_loss": 0.12350133061408997, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "ref_ce_loss": 0.14425380527973175, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "loss": 0.8277546763420105, + "step": 7690 + }, + { + "ce_loss": 0.2607184648513794, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "distill_loss": 0.10593435913324356, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "ref_ce_loss": 0.16472357511520386, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "loss": 1.047260046005249, + "step": 7690 + }, + { + "ce_loss": 0.3077158033847809, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "distill_loss": 0.14151984453201294, + "epoch": 2.5650433622414943, + "step": 7690 + }, + { + "epoch": 2.5650433622414943, + "ref_ce_loss": 0.12401144206523895, + "step": 7690 + }, + { + "epoch": 2.5683789192795197, + "loss": 0.5345, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "grad_norm": 2.5511090755462646, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "learning_rate": 0.00023815467727860163, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "loss": 0.4556001126766205, + "step": 7700 + }, + { + "ce_loss": 0.10793557018041611, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "distill_loss": 0.09077011793851852, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "ref_ce_loss": 0.1416168063879013, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "loss": 0.3976340591907501, + "step": 7700 + }, + { + "ce_loss": 0.1789923459291458, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "distill_loss": 0.08997838199138641, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "ref_ce_loss": 0.08977121859788895, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "loss": 0.43043792247772217, + "step": 7700 + }, + { + "ce_loss": 0.10186466574668884, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "distill_loss": 0.0814705491065979, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "ref_ce_loss": 0.11523747444152832, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "loss": 0.7596093416213989, + "step": 7700 + }, + { + "ce_loss": 0.3001655638217926, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "distill_loss": 0.13424482941627502, + "epoch": 2.5683789192795197, + "step": 7700 + }, + { + "epoch": 2.5683789192795197, + "ref_ce_loss": 0.19055742025375366, + "step": 7700 + }, + { + "epoch": 2.571714476317545, + "loss": 0.5771, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "grad_norm": 3.4679062366485596, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "learning_rate": 0.00023799070845217381, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "loss": 0.33432021737098694, + "step": 7710 + }, + { + "ce_loss": 0.1009223535656929, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "distill_loss": 0.0869167223572731, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "ref_ce_loss": 0.08801834285259247, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "loss": 0.4419139325618744, + "step": 7710 + }, + { + "ce_loss": 0.1773403137922287, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "distill_loss": 0.12590724229812622, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "ref_ce_loss": 0.137704998254776, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "loss": 0.505905032157898, + "step": 7710 + }, + { + "ce_loss": 0.21214526891708374, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "distill_loss": 0.14007237553596497, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "ref_ce_loss": 0.1139286682009697, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "loss": 0.47575730085372925, + "step": 7710 + }, + { + "ce_loss": 0.11864303797483444, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "distill_loss": 0.12276129424571991, + "epoch": 2.571714476317545, + "step": 7710 + }, + { + "epoch": 2.571714476317545, + "ref_ce_loss": 0.12149628251791, + "step": 7710 + }, + { + "epoch": 2.5750500333555704, + "loss": 0.5578, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "grad_norm": 4.5611138343811035, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "learning_rate": 0.00023782657916540325, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "loss": 0.6805188655853271, + "step": 7720 + }, + { + "ce_loss": 0.2107529491186142, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "distill_loss": 0.12623649835586548, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "ref_ce_loss": 0.146802619099617, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "loss": 0.6122359037399292, + "step": 7720 + }, + { + "ce_loss": 0.23602664470672607, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "distill_loss": 0.1204591616988182, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "ref_ce_loss": 0.17816196382045746, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "loss": 0.5088629722595215, + "step": 7720 + }, + { + "ce_loss": 0.16029979288578033, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "distill_loss": 0.11510752141475677, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "ref_ce_loss": 0.11622966080904007, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "loss": 0.4893638491630554, + "step": 7720 + }, + { + "ce_loss": 0.09988599270582199, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "distill_loss": 0.12076612561941147, + "epoch": 2.5750500333555704, + "step": 7720 + }, + { + "epoch": 2.5750500333555704, + "ref_ce_loss": 0.1032547876238823, + "step": 7720 + }, + { + "epoch": 2.5783855903935957, + "loss": 0.5313, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "grad_norm": 3.259249687194824, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "learning_rate": 0.00023766228971759706, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "loss": 0.4709855914115906, + "step": 7730 + }, + { + "ce_loss": 0.17486350238323212, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "distill_loss": 0.11171098798513412, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "ref_ce_loss": 0.13182541728019714, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "loss": 0.4668689966201782, + "step": 7730 + }, + { + "ce_loss": 0.18540440499782562, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "distill_loss": 0.15899895131587982, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "ref_ce_loss": 0.08990299701690674, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "loss": 0.48005780577659607, + "step": 7730 + }, + { + "ce_loss": 0.21282799541950226, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "distill_loss": 0.15303613245487213, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "ref_ce_loss": 0.09080526977777481, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "loss": 0.6611260175704956, + "step": 7730 + }, + { + "ce_loss": 0.18075811862945557, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "distill_loss": 0.17533832788467407, + "epoch": 2.5783855903935957, + "step": 7730 + }, + { + "epoch": 2.5783855903935957, + "ref_ce_loss": 0.1334233433008194, + "step": 7730 + }, + { + "epoch": 2.581721147431621, + "loss": 0.5627, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "grad_norm": 3.2500455379486084, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "learning_rate": 0.00023749784040835438, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "loss": 0.6345797777175903, + "step": 7740 + }, + { + "ce_loss": 0.21962440013885498, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "distill_loss": 0.10905186831951141, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "ref_ce_loss": 0.14250461757183075, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "loss": 0.34530290961265564, + "step": 7740 + }, + { + "ce_loss": 0.1335408091545105, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "distill_loss": 0.09475085884332657, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "ref_ce_loss": 0.11538613587617874, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "loss": 0.5248610973358154, + "step": 7740 + }, + { + "ce_loss": 0.19531312584877014, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "distill_loss": 0.13128426671028137, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "ref_ce_loss": 0.14382454752922058, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "loss": 0.5610891580581665, + "step": 7740 + }, + { + "ce_loss": 0.18045581877231598, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "distill_loss": 0.11995457112789154, + "epoch": 2.581721147431621, + "step": 7740 + }, + { + "epoch": 2.581721147431621, + "ref_ce_loss": 0.16676151752471924, + "step": 7740 + }, + { + "epoch": 2.5850567044696464, + "loss": 0.5274, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "grad_norm": 2.872438669204712, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "learning_rate": 0.00023733323153756587, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "loss": 0.717305064201355, + "step": 7750 + }, + { + "ce_loss": 0.1562374085187912, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "distill_loss": 0.09478169679641724, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "ref_ce_loss": 0.14355888962745667, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "loss": 0.5134044289588928, + "step": 7750 + }, + { + "ce_loss": 0.18214938044548035, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "distill_loss": 0.12028078734874725, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "ref_ce_loss": 0.12155014276504517, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "loss": 0.6256336569786072, + "step": 7750 + }, + { + "ce_loss": 0.06659332662820816, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "distill_loss": 0.07671971619129181, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "ref_ce_loss": 0.09547872841358185, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "loss": 0.7374293804168701, + "step": 7750 + }, + { + "ce_loss": 0.24218425154685974, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "distill_loss": 0.12191884964704514, + "epoch": 2.5850567044696464, + "step": 7750 + }, + { + "epoch": 2.5850567044696464, + "ref_ce_loss": 0.1148589551448822, + "step": 7750 + }, + { + "epoch": 2.5883922615076718, + "loss": 0.5772, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "grad_norm": 4.004894256591797, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "learning_rate": 0.00023716846340541317, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "loss": 0.6149783134460449, + "step": 7760 + }, + { + "ce_loss": 0.13599584996700287, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "distill_loss": 0.09181113541126251, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "ref_ce_loss": 0.0897497609257698, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "loss": 0.5708010196685791, + "step": 7760 + }, + { + "ce_loss": 0.22451792657375336, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "distill_loss": 0.1451644003391266, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "ref_ce_loss": 0.14718560874462128, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "loss": 0.28180474042892456, + "step": 7760 + }, + { + "ce_loss": 0.07587552815675735, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "distill_loss": 0.08645644783973694, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "ref_ce_loss": 0.08731941133737564, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "loss": 0.5717758536338806, + "step": 7760 + }, + { + "ce_loss": 0.14076487720012665, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "distill_loss": 0.11443852633237839, + "epoch": 2.5883922615076718, + "step": 7760 + }, + { + "epoch": 2.5883922615076718, + "ref_ce_loss": 0.08949923515319824, + "step": 7760 + }, + { + "epoch": 2.591727818545697, + "loss": 0.5554, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "grad_norm": 3.8314449787139893, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "learning_rate": 0.00023700353631236838, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "loss": 0.4185023307800293, + "step": 7770 + }, + { + "ce_loss": 0.15791524946689606, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "distill_loss": 0.12973660230636597, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "ref_ce_loss": 0.13063375651836395, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "loss": 0.610988438129425, + "step": 7770 + }, + { + "ce_loss": 0.20933449268341064, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "distill_loss": 0.10694743692874908, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "ref_ce_loss": 0.15038563311100006, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "loss": 0.43110722303390503, + "step": 7770 + }, + { + "ce_loss": 0.17192485928535461, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "distill_loss": 0.13432137668132782, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "ref_ce_loss": 0.12439984828233719, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "loss": 0.3572756052017212, + "step": 7770 + }, + { + "ce_loss": 0.11541720479726791, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "distill_loss": 0.09471176564693451, + "epoch": 2.591727818545697, + "step": 7770 + }, + { + "epoch": 2.591727818545697, + "ref_ce_loss": 0.07893867790699005, + "step": 7770 + }, + { + "epoch": 2.5950633755837225, + "loss": 0.5271, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "grad_norm": 2.763399839401245, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "learning_rate": 0.00023683845055919348, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "loss": 0.8820345997810364, + "step": 7780 + }, + { + "ce_loss": 0.15303368866443634, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "distill_loss": 0.10918956249952316, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "ref_ce_loss": 0.1406395137310028, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "loss": 0.5381264686584473, + "step": 7780 + }, + { + "ce_loss": 0.09407369047403336, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "distill_loss": 0.09477350115776062, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "ref_ce_loss": 0.10281086713075638, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "loss": 0.5149239301681519, + "step": 7780 + }, + { + "ce_loss": 0.16611416637897491, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "distill_loss": 0.10362285375595093, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "ref_ce_loss": 0.16059048473834991, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "loss": 0.34199070930480957, + "step": 7780 + }, + { + "ce_loss": 0.1551412045955658, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "distill_loss": 0.09238369017839432, + "epoch": 2.5950633755837225, + "step": 7780 + }, + { + "epoch": 2.5950633755837225, + "ref_ce_loss": 0.06881777942180634, + "step": 7780 + }, + { + "epoch": 2.598398932621748, + "loss": 0.5522, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "grad_norm": 5.211056232452393, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "learning_rate": 0.00023667320644693972, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "loss": 0.4736732244491577, + "step": 7790 + }, + { + "ce_loss": 0.16912175714969635, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "distill_loss": 0.13457514345645905, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "ref_ce_loss": 0.10892794281244278, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "loss": 0.7895961999893188, + "step": 7790 + }, + { + "ce_loss": 0.12324633449316025, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "distill_loss": 0.13549686968326569, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "ref_ce_loss": 0.13592009246349335, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "loss": 0.42667579650878906, + "step": 7790 + }, + { + "ce_loss": 0.15414555370807648, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "distill_loss": 0.1340002715587616, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "ref_ce_loss": 0.10248945653438568, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "loss": 0.6181153059005737, + "step": 7790 + }, + { + "ce_loss": 0.20727142691612244, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "distill_loss": 0.1465279459953308, + "epoch": 2.598398932621748, + "step": 7790 + }, + { + "epoch": 2.598398932621748, + "ref_ce_loss": 0.10699298232793808, + "step": 7790 + }, + { + "epoch": 2.601734489659773, + "loss": 0.5686, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "grad_norm": 2.9544246196746826, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "learning_rate": 0.0002365078042769472, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "loss": 0.7977356910705566, + "step": 7800 + }, + { + "ce_loss": 0.14952455461025238, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "distill_loss": 0.13543793559074402, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "ref_ce_loss": 0.13750240206718445, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "loss": 0.5125139355659485, + "step": 7800 + }, + { + "ce_loss": 0.11975100636482239, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "distill_loss": 0.11853191256523132, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "ref_ce_loss": 0.11989688873291016, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "loss": 0.6820304989814758, + "step": 7800 + }, + { + "ce_loss": 0.25920236110687256, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "distill_loss": 0.16221410036087036, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "ref_ce_loss": 0.13963061571121216, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "loss": 0.48601311445236206, + "step": 7800 + }, + { + "ce_loss": 0.16878481209278107, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "distill_loss": 0.12551510334014893, + "epoch": 2.601734489659773, + "step": 7800 + }, + { + "epoch": 2.601734489659773, + "ref_ce_loss": 0.13169850409030914, + "step": 7800 + }, + { + "epoch": 2.6050700466977985, + "loss": 0.5393, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "grad_norm": 3.2493793964385986, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "learning_rate": 0.00023634224435084417, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "loss": 0.6498173475265503, + "step": 7810 + }, + { + "ce_loss": 0.22611533105373383, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "distill_loss": 0.22686871886253357, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "ref_ce_loss": 0.17064939439296722, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "loss": 0.624129056930542, + "step": 7810 + }, + { + "ce_loss": 0.23449084162712097, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "distill_loss": 0.19048425555229187, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "ref_ce_loss": 0.1580566167831421, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "loss": 0.5485934019088745, + "step": 7810 + }, + { + "ce_loss": 0.21335983276367188, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "distill_loss": 0.17096811532974243, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "ref_ce_loss": 0.12088602781295776, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "loss": 0.32192063331604004, + "step": 7810 + }, + { + "ce_loss": 0.08201882988214493, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "distill_loss": 0.15331301093101501, + "epoch": 2.6050700466977985, + "step": 7810 + }, + { + "epoch": 2.6050700466977985, + "ref_ce_loss": 0.08554382622241974, + "step": 7810 + }, + { + "epoch": 2.608405603735824, + "loss": 0.586, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "grad_norm": 3.0376055240631104, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "learning_rate": 0.00023617652697054673, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "loss": 0.6930296421051025, + "step": 7820 + }, + { + "ce_loss": 0.2070154845714569, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "distill_loss": 0.13998956978321075, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "ref_ce_loss": 0.1303083449602127, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "loss": 0.5493748784065247, + "step": 7820 + }, + { + "ce_loss": 0.11979985982179642, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "distill_loss": 0.11463198810815811, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "ref_ce_loss": 0.10523556917905807, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "loss": 0.4238487482070923, + "step": 7820 + }, + { + "ce_loss": 0.1376940906047821, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "distill_loss": 0.1326865404844284, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "ref_ce_loss": 0.12547308206558228, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "loss": 0.639647901058197, + "step": 7820 + }, + { + "ce_loss": 0.30324020981788635, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "distill_loss": 0.11521129310131073, + "epoch": 2.608405603735824, + "step": 7820 + }, + { + "epoch": 2.608405603735824, + "ref_ce_loss": 0.22053390741348267, + "step": 7820 + }, + { + "epoch": 2.611741160773849, + "loss": 0.5546, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "grad_norm": 4.630809783935547, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "learning_rate": 0.00023601065243825795, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "loss": 0.4851466715335846, + "step": 7830 + }, + { + "ce_loss": 0.18493600189685822, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "distill_loss": 0.12290176749229431, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "ref_ce_loss": 0.11735208332538605, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "loss": 0.5059221982955933, + "step": 7830 + }, + { + "ce_loss": 0.21578659117221832, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "distill_loss": 0.17096026241779327, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "ref_ce_loss": 0.11815796047449112, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "loss": 0.5575016140937805, + "step": 7830 + }, + { + "ce_loss": 0.18043434619903564, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "distill_loss": 0.12575297057628632, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "ref_ce_loss": 0.15635573863983154, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "loss": 0.31332072615623474, + "step": 7830 + }, + { + "ce_loss": 0.09760003536939621, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "distill_loss": 0.10402275621891022, + "epoch": 2.611741160773849, + "step": 7830 + }, + { + "epoch": 2.611741160773849, + "ref_ce_loss": 0.11161261796951294, + "step": 7830 + }, + { + "epoch": 2.6150767178118746, + "loss": 0.5233, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "grad_norm": 2.5343832969665527, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "learning_rate": 0.00023584462105646754, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "loss": 0.5742359757423401, + "step": 7840 + }, + { + "ce_loss": 0.22729800641536713, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "distill_loss": 0.1504513919353485, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "ref_ce_loss": 0.14994561672210693, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "loss": 0.5492827892303467, + "step": 7840 + }, + { + "ce_loss": 0.14900290966033936, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "distill_loss": 0.13081350922584534, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "ref_ce_loss": 0.10538212954998016, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "loss": 0.3085349500179291, + "step": 7840 + }, + { + "ce_loss": 0.09896077960729599, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "distill_loss": 0.10900798439979553, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "ref_ce_loss": 0.10040585696697235, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "loss": 0.3425389528274536, + "step": 7840 + }, + { + "ce_loss": 0.1181720420718193, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "distill_loss": 0.0888860896229744, + "epoch": 2.6150767178118746, + "step": 7840 + }, + { + "epoch": 2.6150767178118746, + "ref_ce_loss": 0.1350540816783905, + "step": 7840 + }, + { + "epoch": 2.6184122748499, + "loss": 0.5468, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "grad_norm": 3.092991828918457, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "learning_rate": 0.0002356784331279513, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "loss": 0.3807421922683716, + "step": 7850 + }, + { + "ce_loss": 0.1497032344341278, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "distill_loss": 0.13074158132076263, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "ref_ce_loss": 0.10015086084604263, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "loss": 0.5140164494514465, + "step": 7850 + }, + { + "ce_loss": 0.07548467069864273, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "distill_loss": 0.09762059152126312, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "ref_ce_loss": 0.09445083141326904, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "loss": 0.5912425518035889, + "step": 7850 + }, + { + "ce_loss": 0.15175721049308777, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "distill_loss": 0.1264921873807907, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "ref_ce_loss": 0.15066170692443848, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "loss": 0.5041841864585876, + "step": 7850 + }, + { + "ce_loss": 0.19449378550052643, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "distill_loss": 0.12563063204288483, + "epoch": 2.6184122748499, + "step": 7850 + }, + { + "epoch": 2.6184122748499, + "ref_ce_loss": 0.1535726934671402, + "step": 7850 + }, + { + "epoch": 2.6217478318879253, + "loss": 0.4922, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "grad_norm": 2.572794198989868, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "learning_rate": 0.00023551208895577038, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "loss": 0.3305208086967468, + "step": 7860 + }, + { + "ce_loss": 0.07853977382183075, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "distill_loss": 0.11406944692134857, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "ref_ce_loss": 0.07194974273443222, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "loss": 0.40856409072875977, + "step": 7860 + }, + { + "ce_loss": 0.12992922961711884, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "distill_loss": 0.1046760231256485, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "ref_ce_loss": 0.126336470246315, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "loss": 0.5514043569564819, + "step": 7860 + }, + { + "ce_loss": 0.2061086893081665, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "distill_loss": 0.10851980000734329, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "ref_ce_loss": 0.181810200214386, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "loss": 0.6222173571586609, + "step": 7860 + }, + { + "ce_loss": 0.1734030842781067, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "distill_loss": 0.1488630175590515, + "epoch": 2.6217478318879253, + "step": 7860 + }, + { + "epoch": 2.6217478318879253, + "ref_ce_loss": 0.12546145915985107, + "step": 7860 + }, + { + "epoch": 2.6250833889259506, + "loss": 0.5458, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "grad_norm": 2.7927324771881104, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "learning_rate": 0.000235345588843271, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "loss": 0.32617244124412537, + "step": 7870 + }, + { + "ce_loss": 0.052165593951940536, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "distill_loss": 0.09367629140615463, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "ref_ce_loss": 0.09321639686822891, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "loss": 0.6275125741958618, + "step": 7870 + }, + { + "ce_loss": 0.15996789932250977, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "distill_loss": 0.10561716556549072, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "ref_ce_loss": 0.11221954226493835, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "loss": 0.43381965160369873, + "step": 7870 + }, + { + "ce_loss": 0.1376059651374817, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "distill_loss": 0.1076732650399208, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "ref_ce_loss": 0.15326763689517975, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "loss": 0.4509509801864624, + "step": 7870 + }, + { + "ce_loss": 0.1815311163663864, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "distill_loss": 0.09893777966499329, + "epoch": 2.6250833889259506, + "step": 7870 + }, + { + "epoch": 2.6250833889259506, + "ref_ce_loss": 0.14123815298080444, + "step": 7870 + }, + { + "epoch": 2.628418945963976, + "loss": 0.5622, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "grad_norm": 3.051100492477417, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "learning_rate": 0.0002351789330940836, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "loss": 0.4621231257915497, + "step": 7880 + }, + { + "ce_loss": 0.20068912208080292, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "distill_loss": 0.10784181952476501, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "ref_ce_loss": 0.10929391533136368, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "loss": 0.5110466480255127, + "step": 7880 + }, + { + "ce_loss": 0.14794176816940308, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "distill_loss": 0.08582356572151184, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "ref_ce_loss": 0.09186380356550217, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "loss": 0.31455156207084656, + "step": 7880 + }, + { + "ce_loss": 0.09946168214082718, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "distill_loss": 0.08664387464523315, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "ref_ce_loss": 0.09416986256837845, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "loss": 0.7443742752075195, + "step": 7880 + }, + { + "ce_loss": 0.17656545341014862, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "distill_loss": 0.11620085686445236, + "epoch": 2.628418945963976, + "step": 7880 + }, + { + "epoch": 2.628418945963976, + "ref_ce_loss": 0.09211236983537674, + "step": 7880 + }, + { + "epoch": 2.6317545030020013, + "loss": 0.5989, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "grad_norm": 2.1629552841186523, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "learning_rate": 0.00023501212201212262, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "loss": 0.5096309185028076, + "step": 7890 + }, + { + "ce_loss": 0.09982051700353622, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "distill_loss": 0.11478620767593384, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "ref_ce_loss": 0.14122353494167328, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "loss": 0.608100414276123, + "step": 7890 + }, + { + "ce_loss": 0.20234572887420654, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "distill_loss": 0.13174159824848175, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "ref_ce_loss": 0.14330333471298218, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "loss": 0.5019533634185791, + "step": 7890 + }, + { + "ce_loss": 0.18130330741405487, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "distill_loss": 0.10709169507026672, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "ref_ce_loss": 0.15589384734630585, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "loss": 0.4099520742893219, + "step": 7890 + }, + { + "ce_loss": 0.1622970849275589, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "distill_loss": 0.13596194982528687, + "epoch": 2.6317545030020013, + "step": 7890 + }, + { + "epoch": 2.6317545030020013, + "ref_ce_loss": 0.11146480590105057, + "step": 7890 + }, + { + "epoch": 2.6350900600400267, + "loss": 0.5707, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "grad_norm": 2.643676996231079, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "learning_rate": 0.00023484515590158566, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "loss": 0.37032589316368103, + "step": 7900 + }, + { + "ce_loss": 0.1284305900335312, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "distill_loss": 0.10827270150184631, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "ref_ce_loss": 0.08193215727806091, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "loss": 0.3987683951854706, + "step": 7900 + }, + { + "ce_loss": 0.1566096395254135, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "distill_loss": 0.14688178896903992, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "ref_ce_loss": 0.09464366734027863, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "loss": 0.9638165235519409, + "step": 7900 + }, + { + "ce_loss": 0.27305978536605835, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "distill_loss": 0.18029692769050598, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "ref_ce_loss": 0.15340293943881989, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "loss": 0.4966169595718384, + "step": 7900 + }, + { + "ce_loss": 0.13691580295562744, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "distill_loss": 0.11072009056806564, + "epoch": 2.6350900600400267, + "step": 7900 + }, + { + "epoch": 2.6350900600400267, + "ref_ce_loss": 0.14098653197288513, + "step": 7900 + }, + { + "epoch": 2.638425617078052, + "loss": 0.5957, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "grad_norm": 1.870453953742981, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "learning_rate": 0.00023467803506695305, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "loss": 0.3408009707927704, + "step": 7910 + }, + { + "ce_loss": 0.1164073795080185, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "distill_loss": 0.11458929628133774, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "ref_ce_loss": 0.0619593970477581, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "loss": 0.6032022833824158, + "step": 7910 + }, + { + "ce_loss": 0.13740438222885132, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "distill_loss": 0.18423528969287872, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "ref_ce_loss": 0.15096089243888855, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "loss": 0.3866853713989258, + "step": 7910 + }, + { + "ce_loss": 0.11736882477998734, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "distill_loss": 0.1122303456068039, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "ref_ce_loss": 0.10660815238952637, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "loss": 0.4587579369544983, + "step": 7910 + }, + { + "ce_loss": 0.15818382799625397, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "distill_loss": 0.12847086787223816, + "epoch": 2.638425617078052, + "step": 7910 + }, + { + "epoch": 2.638425617078052, + "ref_ce_loss": 0.11664664000272751, + "step": 7910 + }, + { + "epoch": 2.6417611741160774, + "loss": 0.5464, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "grad_norm": 2.7910468578338623, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "learning_rate": 0.00023451075981298716, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "loss": 0.4135196805000305, + "step": 7920 + }, + { + "ce_loss": 0.162226140499115, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "distill_loss": 0.12755383551120758, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "ref_ce_loss": 0.092817023396492, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "loss": 0.96152663230896, + "step": 7920 + }, + { + "ce_loss": 0.183192178606987, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "distill_loss": 0.17160940170288086, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "ref_ce_loss": 0.09496646374464035, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "loss": 0.5214636921882629, + "step": 7920 + }, + { + "ce_loss": 0.14947235584259033, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "distill_loss": 0.11900770664215088, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "ref_ce_loss": 0.11163744330406189, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "loss": 0.2927347719669342, + "step": 7920 + }, + { + "ce_loss": 0.08152108639478683, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "distill_loss": 0.10856892913579941, + "epoch": 2.6417611741160774, + "step": 7920 + }, + { + "epoch": 2.6417611741160774, + "ref_ce_loss": 0.10249659419059753, + "step": 7920 + }, + { + "epoch": 2.6450967311541027, + "loss": 0.5723, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "grad_norm": 4.5653533935546875, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "learning_rate": 0.00023434333044473215, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "loss": 0.5235077738761902, + "step": 7930 + }, + { + "ce_loss": 0.13343635201454163, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "distill_loss": 0.257224977016449, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "ref_ce_loss": 0.10070498287677765, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "loss": 0.4883321225643158, + "step": 7930 + }, + { + "ce_loss": 0.130856454372406, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "distill_loss": 0.15088599920272827, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "ref_ce_loss": 0.16542506217956543, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "loss": 0.7994807958602905, + "step": 7930 + }, + { + "ce_loss": 0.1741330325603485, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "distill_loss": 0.3023594319820404, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "ref_ce_loss": 0.11508870869874954, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "loss": 0.5890263319015503, + "step": 7930 + }, + { + "ce_loss": 0.17294292151927948, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "distill_loss": 0.20210260152816772, + "epoch": 2.6450967311541027, + "step": 7930 + }, + { + "epoch": 2.6450967311541027, + "ref_ce_loss": 0.15082262456417084, + "step": 7930 + }, + { + "epoch": 2.648432288192128, + "loss": 0.6198, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "grad_norm": 2.13840913772583, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "learning_rate": 0.00023417574726751318, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "loss": 0.7031892538070679, + "step": 7940 + }, + { + "ce_loss": 0.13717004656791687, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "distill_loss": 0.14210714399814606, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "ref_ce_loss": 0.11063428968191147, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "loss": 0.7125575542449951, + "step": 7940 + }, + { + "ce_loss": 0.20230668783187866, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "distill_loss": 0.25111111998558044, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "ref_ce_loss": 0.2036392092704773, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "loss": 0.5634617805480957, + "step": 7940 + }, + { + "ce_loss": 0.20257918536663055, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "distill_loss": 0.17622129619121552, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "ref_ce_loss": 0.13330873847007751, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "loss": 0.7062669396400452, + "step": 7940 + }, + { + "ce_loss": 0.08234570175409317, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "distill_loss": 0.24133436381816864, + "epoch": 2.648432288192128, + "step": 7940 + }, + { + "epoch": 2.648432288192128, + "ref_ce_loss": 0.10071347653865814, + "step": 7940 + }, + { + "epoch": 2.6517678452301534, + "loss": 0.5858, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "grad_norm": 2.677126884460449, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "learning_rate": 0.0002340080105869358, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "loss": 0.5237668752670288, + "step": 7950 + }, + { + "ce_loss": 0.20540541410446167, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "distill_loss": 0.11389851570129395, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "ref_ce_loss": 0.13045738637447357, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "loss": 0.5186854600906372, + "step": 7950 + }, + { + "ce_loss": 0.18487921357154846, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "distill_loss": 0.14122274518013, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "ref_ce_loss": 0.15152312815189362, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "loss": 0.49603432416915894, + "step": 7950 + }, + { + "ce_loss": 0.1827073097229004, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "distill_loss": 0.1372450590133667, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "ref_ce_loss": 0.12451291084289551, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "loss": 0.6272179484367371, + "step": 7950 + }, + { + "ce_loss": 0.27008381485939026, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "distill_loss": 0.1309342086315155, + "epoch": 2.6517678452301534, + "step": 7950 + }, + { + "epoch": 2.6517678452301534, + "ref_ce_loss": 0.1763468235731125, + "step": 7950 + }, + { + "epoch": 2.6551034022681788, + "loss": 0.6062, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "grad_norm": 1.8981552124023438, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "learning_rate": 0.00023384012070888557, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "loss": 0.6313665509223938, + "step": 7960 + }, + { + "ce_loss": 0.12360011041164398, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "distill_loss": 0.1303849071264267, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "ref_ce_loss": 0.13423965871334076, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "loss": 0.4521801173686981, + "step": 7960 + }, + { + "ce_loss": 0.20214208960533142, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "distill_loss": 0.13033844530582428, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "ref_ce_loss": 0.11876943707466125, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "loss": 0.7230217456817627, + "step": 7960 + }, + { + "ce_loss": 0.20204299688339233, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "distill_loss": 0.14705559611320496, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "ref_ce_loss": 0.12485090643167496, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "loss": 0.33352601528167725, + "step": 7960 + }, + { + "ce_loss": 0.08580795675516129, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "distill_loss": 0.08311134576797485, + "epoch": 2.6551034022681788, + "step": 7960 + }, + { + "epoch": 2.6551034022681788, + "ref_ce_loss": 0.0709758847951889, + "step": 7960 + }, + { + "epoch": 2.658438959306204, + "loss": 0.5841, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "grad_norm": 7.362756729125977, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "learning_rate": 0.00023367207793952737, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "loss": 0.38419678807258606, + "step": 7970 + }, + { + "ce_loss": 0.09084519743919373, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "distill_loss": 0.13838332891464233, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "ref_ce_loss": 0.12994177639484406, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "loss": 0.5091409683227539, + "step": 7970 + }, + { + "ce_loss": 0.17732951045036316, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "distill_loss": 0.12083851546049118, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "ref_ce_loss": 0.13549870252609253, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "loss": 0.5226490497589111, + "step": 7970 + }, + { + "ce_loss": 0.17717614769935608, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "distill_loss": 0.131042018532753, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "ref_ce_loss": 0.15869306027889252, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "loss": 0.5586010813713074, + "step": 7970 + }, + { + "ce_loss": 0.16853399574756622, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "distill_loss": 0.21672941744327545, + "epoch": 2.658438959306204, + "step": 7970 + }, + { + "epoch": 2.658438959306204, + "ref_ce_loss": 0.121253103017807, + "step": 7970 + }, + { + "epoch": 2.6617745163442295, + "loss": 0.5451, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "grad_norm": 2.5755162239074707, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "learning_rate": 0.00023350388258530497, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "loss": 0.48963046073913574, + "step": 7980 + }, + { + "ce_loss": 0.16120007634162903, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "distill_loss": 0.20963573455810547, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "ref_ce_loss": 0.118538036942482, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "loss": 0.5607742071151733, + "step": 7980 + }, + { + "ce_loss": 0.12266329675912857, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "distill_loss": 0.13333964347839355, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "ref_ce_loss": 0.10077905654907227, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "loss": 0.364191472530365, + "step": 7980 + }, + { + "ce_loss": 0.07570328563451767, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "distill_loss": 0.08897201716899872, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "ref_ce_loss": 0.08781538903713226, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "loss": 0.5703341364860535, + "step": 7980 + }, + { + "ce_loss": 0.13742899894714355, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "distill_loss": 0.16016842424869537, + "epoch": 2.6617745163442295, + "step": 7980 + }, + { + "epoch": 2.6617745163442295, + "ref_ce_loss": 0.11559545993804932, + "step": 7980 + }, + { + "epoch": 2.665110073382255, + "loss": 0.5638, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "grad_norm": 4.504335880279541, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "learning_rate": 0.0002333355349529403, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "loss": 0.7399834394454956, + "step": 7990 + }, + { + "ce_loss": 0.1619812250137329, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "distill_loss": 0.1331026256084442, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "ref_ce_loss": 0.11494561284780502, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "loss": 0.4661237597465515, + "step": 7990 + }, + { + "ce_loss": 0.07970134913921356, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "distill_loss": 0.11089581996202469, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "ref_ce_loss": 0.09639148414134979, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "loss": 0.8837971091270447, + "step": 7990 + }, + { + "ce_loss": 0.12881000339984894, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "distill_loss": 0.13882938027381897, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "ref_ce_loss": 0.10138195753097534, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "loss": 0.6575859785079956, + "step": 7990 + }, + { + "ce_loss": 0.17015022039413452, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "distill_loss": 0.1299816071987152, + "epoch": 2.665110073382255, + "step": 7990 + }, + { + "epoch": 2.665110073382255, + "ref_ce_loss": 0.11390574276447296, + "step": 7990 + }, + { + "epoch": 2.66844563042028, + "loss": 0.5827, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "grad_norm": 2.684920310974121, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "learning_rate": 0.0002331670353494331, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "loss": 1.0271755456924438, + "step": 8000 + }, + { + "ce_loss": 0.20616409182548523, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "distill_loss": 0.13934825360774994, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "ref_ce_loss": 0.1512710601091385, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "loss": 0.4182243347167969, + "step": 8000 + }, + { + "ce_loss": 0.1766296625137329, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "distill_loss": 0.13832080364227295, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "ref_ce_loss": 0.10316318273544312, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "loss": 0.4399473965167999, + "step": 8000 + }, + { + "ce_loss": 0.17161087691783905, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "distill_loss": 0.1129341796040535, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "ref_ce_loss": 0.10985776782035828, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "loss": 0.4889751076698303, + "step": 8000 + }, + { + "ce_loss": 0.16917948424816132, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "distill_loss": 0.16153065860271454, + "epoch": 2.66844563042028, + "step": 8000 + }, + { + "epoch": 2.66844563042028, + "ref_ce_loss": 0.12116784602403641, + "step": 8000 + }, + { + "epoch": 2.6717811874583055, + "loss": 0.5545, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "grad_norm": 3.7803685665130615, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "learning_rate": 0.00023299838408206015, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "loss": 0.5689238905906677, + "step": 8010 + }, + { + "ce_loss": 0.17813092470169067, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "distill_loss": 0.22044917941093445, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "ref_ce_loss": 0.08347884565591812, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "loss": 0.888495922088623, + "step": 8010 + }, + { + "ce_loss": 0.10351214557886124, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "distill_loss": 0.11769212782382965, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "ref_ce_loss": 0.0803191289305687, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "loss": 0.4115409553050995, + "step": 8010 + }, + { + "ce_loss": 0.1512393355369568, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "distill_loss": 0.126437708735466, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "ref_ce_loss": 0.10095108300447464, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "loss": 1.0289809703826904, + "step": 8010 + }, + { + "ce_loss": 0.2210640162229538, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "distill_loss": 0.14857910573482513, + "epoch": 2.6717811874583055, + "step": 8010 + }, + { + "epoch": 2.6717811874583055, + "ref_ce_loss": 0.12264930456876755, + "step": 8010 + }, + { + "epoch": 2.675116744496331, + "loss": 0.6117, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "grad_norm": 2.5475668907165527, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "learning_rate": 0.00023282958145837477, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "loss": 0.4965721666812897, + "step": 8020 + }, + { + "ce_loss": 0.1637820154428482, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "distill_loss": 0.12568223476409912, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "ref_ce_loss": 0.13360291719436646, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "loss": 0.28336021304130554, + "step": 8020 + }, + { + "ce_loss": 0.07376381009817123, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "distill_loss": 0.1208600327372551, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "ref_ce_loss": 0.08844497054815292, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "loss": 0.5349476933479309, + "step": 8020 + }, + { + "ce_loss": 0.16752538084983826, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "distill_loss": 0.13089683651924133, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "ref_ce_loss": 0.07995325326919556, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "loss": 0.6904729604721069, + "step": 8020 + }, + { + "ce_loss": 0.13420693576335907, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "distill_loss": 0.17605559527873993, + "epoch": 2.675116744496331, + "step": 8020 + }, + { + "epoch": 2.675116744496331, + "ref_ce_loss": 0.08207875490188599, + "step": 8020 + }, + { + "epoch": 2.678452301534356, + "loss": 0.5091, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "grad_norm": 2.528420925140381, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "learning_rate": 0.00023266062778620647, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "loss": 0.4364303946495056, + "step": 8030 + }, + { + "ce_loss": 0.15353445708751678, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "distill_loss": 0.13547055423259735, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "ref_ce_loss": 0.10869955271482468, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "loss": 0.49167340993881226, + "step": 8030 + }, + { + "ce_loss": 0.14930841326713562, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "distill_loss": 0.11774240434169769, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "ref_ce_loss": 0.10742420703172684, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "loss": 0.6432998776435852, + "step": 8030 + }, + { + "ce_loss": 0.2559523284435272, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "distill_loss": 0.14143937826156616, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "ref_ce_loss": 0.18032102286815643, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "loss": 0.44476890563964844, + "step": 8030 + }, + { + "ce_loss": 0.08300906419754028, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "distill_loss": 0.11552564054727554, + "epoch": 2.678452301534356, + "step": 8030 + }, + { + "epoch": 2.678452301534356, + "ref_ce_loss": 0.07628441601991653, + "step": 8030 + }, + { + "epoch": 2.6817878585723816, + "loss": 0.5758, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "grad_norm": 2.6837093830108643, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "learning_rate": 0.00023249152337366, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "loss": 0.6911327838897705, + "step": 8040 + }, + { + "ce_loss": 0.08783482015132904, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "distill_loss": 0.1832849681377411, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "ref_ce_loss": 0.1677989512681961, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "loss": 0.508490800857544, + "step": 8040 + }, + { + "ce_loss": 0.1933222860097885, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "distill_loss": 0.1114601269364357, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "ref_ce_loss": 0.15833140909671783, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "loss": 0.3330030143260956, + "step": 8040 + }, + { + "ce_loss": 0.11011120676994324, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "distill_loss": 0.1153588593006134, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "ref_ce_loss": 0.10705897212028503, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "loss": 0.6220952868461609, + "step": 8040 + }, + { + "ce_loss": 0.2026032656431198, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "distill_loss": 0.12767234444618225, + "epoch": 2.6817878585723816, + "step": 8040 + }, + { + "epoch": 2.6817878585723816, + "ref_ce_loss": 0.13173550367355347, + "step": 8040 + }, + { + "epoch": 2.685123415610407, + "loss": 0.489, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "grad_norm": 3.3889288902282715, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "learning_rate": 0.0002323222685291152, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "loss": 0.6236046552658081, + "step": 8050 + }, + { + "ce_loss": 0.1862117052078247, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "distill_loss": 0.15372738242149353, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "ref_ce_loss": 0.11308329552412033, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "loss": 0.8643714189529419, + "step": 8050 + }, + { + "ce_loss": 0.29614153504371643, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "distill_loss": 0.1857837736606598, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "ref_ce_loss": 0.15275658667087555, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "loss": 0.40307721495628357, + "step": 8050 + }, + { + "ce_loss": 0.12835247814655304, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "distill_loss": 0.13174714148044586, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "ref_ce_loss": 0.11546413600444794, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "loss": 1.036539077758789, + "step": 8050 + }, + { + "ce_loss": 0.22426684200763702, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "distill_loss": 0.15441244840621948, + "epoch": 2.685123415610407, + "step": 8050 + }, + { + "epoch": 2.685123415610407, + "ref_ce_loss": 0.15819288790225983, + "step": 8050 + }, + { + "epoch": 2.6884589726484323, + "loss": 0.5733, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "grad_norm": 3.0571844577789307, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "learning_rate": 0.000232152863561226, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "loss": 0.4697049856185913, + "step": 8060 + }, + { + "ce_loss": 0.1788596659898758, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "distill_loss": 0.12807488441467285, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "ref_ce_loss": 0.08363891392946243, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "loss": 0.38888436555862427, + "step": 8060 + }, + { + "ce_loss": 0.04981951415538788, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "distill_loss": 0.10500740259885788, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "ref_ce_loss": 0.08457271754741669, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "loss": 0.5909742712974548, + "step": 8060 + }, + { + "ce_loss": 0.20369437336921692, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "distill_loss": 0.16423441469669342, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "ref_ce_loss": 0.10368082672357559, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "loss": 0.6657927632331848, + "step": 8060 + }, + { + "ce_loss": 0.27945613861083984, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "distill_loss": 0.1881733387708664, + "epoch": 2.6884589726484323, + "step": 8060 + }, + { + "epoch": 2.6884589726484323, + "ref_ce_loss": 0.1429987996816635, + "step": 8060 + }, + { + "epoch": 2.6917945296864576, + "loss": 0.5357, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "grad_norm": 5.027276039123535, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "learning_rate": 0.0002319833087789204, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "loss": 0.39156532287597656, + "step": 8070 + }, + { + "ce_loss": 0.12410548329353333, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "distill_loss": 0.10390803962945938, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "ref_ce_loss": 0.10742595791816711, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "loss": 0.3632297217845917, + "step": 8070 + }, + { + "ce_loss": 0.12438895553350449, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "distill_loss": 0.11003535240888596, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "ref_ce_loss": 0.08501872420310974, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "loss": 0.5452225208282471, + "step": 8070 + }, + { + "ce_loss": 0.1348339021205902, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "distill_loss": 0.1079145073890686, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "ref_ce_loss": 0.12742206454277039, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "loss": 0.5933875441551208, + "step": 8070 + }, + { + "ce_loss": 0.13612274825572968, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "distill_loss": 0.1913990080356598, + "epoch": 2.6917945296864576, + "step": 8070 + }, + { + "epoch": 2.6917945296864576, + "ref_ce_loss": 0.08002374321222305, + "step": 8070 + }, + { + "epoch": 2.695130086724483, + "loss": 0.559, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "grad_norm": 2.6058237552642822, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "learning_rate": 0.00023181360449139936, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "loss": 0.5928658246994019, + "step": 8080 + }, + { + "ce_loss": 0.16962631046772003, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "distill_loss": 0.1335127204656601, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "ref_ce_loss": 0.10307545959949493, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "loss": 0.31053030490875244, + "step": 8080 + }, + { + "ce_loss": 0.1039867177605629, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "distill_loss": 0.09887486696243286, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "ref_ce_loss": 0.07038954645395279, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "loss": 0.4417012631893158, + "step": 8080 + }, + { + "ce_loss": 0.10867048054933548, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "distill_loss": 0.11061249673366547, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "ref_ce_loss": 0.1626960039138794, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "loss": 0.5297577977180481, + "step": 8080 + }, + { + "ce_loss": 0.13766439259052277, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "distill_loss": 0.1209607645869255, + "epoch": 2.695130086724483, + "step": 8080 + }, + { + "epoch": 2.695130086724483, + "ref_ce_loss": 0.09336847811937332, + "step": 8080 + }, + { + "epoch": 2.6984656437625083, + "loss": 0.5166, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "grad_norm": 4.001850605010986, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "learning_rate": 0.00023164375100813656, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "loss": 0.5130573511123657, + "step": 8090 + }, + { + "ce_loss": 0.09124461561441422, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "distill_loss": 0.1190020740032196, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "ref_ce_loss": 0.0816359743475914, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "loss": 0.5037546157836914, + "step": 8090 + }, + { + "ce_loss": 0.16275212168693542, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "distill_loss": 0.11124013364315033, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "ref_ce_loss": 0.11581555753946304, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "loss": 0.37682679295539856, + "step": 8090 + }, + { + "ce_loss": 0.09656783938407898, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "distill_loss": 0.12740042805671692, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "ref_ce_loss": 0.08813125640153885, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "loss": 0.41288870573043823, + "step": 8090 + }, + { + "ce_loss": 0.09926676005125046, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "distill_loss": 0.13379068672657013, + "epoch": 2.6984656437625083, + "step": 8090 + }, + { + "epoch": 2.6984656437625083, + "ref_ce_loss": 0.08153785765171051, + "step": 8090 + }, + { + "epoch": 2.7018012008005337, + "loss": 0.5768, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "grad_norm": 3.228931427001953, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "learning_rate": 0.00023147374863887772, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "loss": 0.4492276906967163, + "step": 8100 + }, + { + "ce_loss": 0.056972142308950424, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "distill_loss": 0.11906185001134872, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "ref_ce_loss": 0.09090115875005722, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "loss": 0.3158268332481384, + "step": 8100 + }, + { + "ce_loss": 0.07716682553291321, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "distill_loss": 0.128421813249588, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "ref_ce_loss": 0.11011645942926407, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "loss": 0.7365747690200806, + "step": 8100 + }, + { + "ce_loss": 0.17575332522392273, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "distill_loss": 0.1499844342470169, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "ref_ce_loss": 0.12130838632583618, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "loss": 0.629986047744751, + "step": 8100 + }, + { + "ce_loss": 0.17869074642658234, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "distill_loss": 0.18918591737747192, + "epoch": 2.7018012008005337, + "step": 8100 + }, + { + "epoch": 2.7018012008005337, + "ref_ce_loss": 0.12489745765924454, + "step": 8100 + }, + { + "epoch": 2.705136757838559, + "loss": 0.5244, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "grad_norm": 1.915163278579712, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "learning_rate": 0.00023130359769364016, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "loss": 0.3695540726184845, + "step": 8110 + }, + { + "ce_loss": 0.1474255472421646, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "distill_loss": 0.12527930736541748, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "ref_ce_loss": 0.07345467805862427, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "loss": 0.45366865396499634, + "step": 8110 + }, + { + "ce_loss": 0.1327705830335617, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "distill_loss": 0.11853218078613281, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "ref_ce_loss": 0.142308309674263, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "loss": 0.7480238080024719, + "step": 8110 + }, + { + "ce_loss": 0.17574192583560944, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "distill_loss": 0.17148953676223755, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "ref_ce_loss": 0.1413096785545349, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "loss": 0.5942070484161377, + "step": 8110 + }, + { + "ce_loss": 0.18245285749435425, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "distill_loss": 0.12027274817228317, + "epoch": 2.705136757838559, + "step": 8110 + }, + { + "epoch": 2.705136757838559, + "ref_ce_loss": 0.11873797327280045, + "step": 8110 + }, + { + "epoch": 2.7084723148765844, + "loss": 0.5295, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "grad_norm": 2.6392195224761963, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "learning_rate": 0.00023113329848271203, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "loss": 0.46645209193229675, + "step": 8120 + }, + { + "ce_loss": 0.18638621270656586, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "distill_loss": 0.12224185466766357, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "ref_ce_loss": 0.11621517688035965, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "loss": 0.47254177927970886, + "step": 8120 + }, + { + "ce_loss": 0.20265349745750427, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "distill_loss": 0.12762364745140076, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "ref_ce_loss": 0.10792994499206543, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "loss": 0.7535987496376038, + "step": 8120 + }, + { + "ce_loss": 0.2545333504676819, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "distill_loss": 0.17720168828964233, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "ref_ce_loss": 0.12154602259397507, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "loss": 0.5472800731658936, + "step": 8120 + }, + { + "ce_loss": 0.24012410640716553, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "distill_loss": 0.14520911872386932, + "epoch": 2.7084723148765844, + "step": 8120 + }, + { + "epoch": 2.7084723148765844, + "ref_ce_loss": 0.09373628348112106, + "step": 8120 + }, + { + "epoch": 2.7118078719146097, + "loss": 0.5461, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "grad_norm": 2.5390615463256836, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "learning_rate": 0.00023096285131665197, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "loss": 0.4835377335548401, + "step": 8130 + }, + { + "ce_loss": 0.12601253390312195, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "distill_loss": 0.10771089792251587, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "ref_ce_loss": 0.09932011365890503, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "loss": 0.2281503975391388, + "step": 8130 + }, + { + "ce_loss": 0.04772244021296501, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "distill_loss": 0.08223338425159454, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "ref_ce_loss": 0.06510313600301743, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "loss": 0.4231301546096802, + "step": 8130 + }, + { + "ce_loss": 0.1172071024775505, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "distill_loss": 0.08207279443740845, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "ref_ce_loss": 0.07831268012523651, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "loss": 0.4807983934879303, + "step": 8130 + }, + { + "ce_loss": 0.17932242155075073, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "distill_loss": 0.0898590013384819, + "epoch": 2.7118078719146097, + "step": 8130 + }, + { + "epoch": 2.7118078719146097, + "ref_ce_loss": 0.12309905141592026, + "step": 8130 + }, + { + "epoch": 2.715143428952635, + "loss": 0.4989, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "grad_norm": 2.704934597015381, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "learning_rate": 0.00023079225650628836, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "loss": 0.6727844476699829, + "step": 8140 + }, + { + "ce_loss": 0.223181813955307, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "distill_loss": 0.15185865759849548, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "ref_ce_loss": 0.11520495265722275, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "loss": 0.46417367458343506, + "step": 8140 + }, + { + "ce_loss": 0.14673767983913422, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "distill_loss": 0.12357361614704132, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "ref_ce_loss": 0.10955695062875748, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "loss": 0.6096020936965942, + "step": 8140 + }, + { + "ce_loss": 0.19598866999149323, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "distill_loss": 0.1323723942041397, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "ref_ce_loss": 0.15653467178344727, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "loss": 0.37559083104133606, + "step": 8140 + }, + { + "ce_loss": 0.12198042124509811, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "distill_loss": 0.11107443273067474, + "epoch": 2.715143428952635, + "step": 8140 + }, + { + "epoch": 2.715143428952635, + "ref_ce_loss": 0.14184345304965973, + "step": 8140 + }, + { + "epoch": 2.7184789859906604, + "loss": 0.4915, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "grad_norm": 2.9659900665283203, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "learning_rate": 0.00023062151436271876, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "loss": 0.5116956830024719, + "step": 8150 + }, + { + "ce_loss": 0.21129418909549713, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "distill_loss": 0.11995405703783035, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "ref_ce_loss": 0.12191393226385117, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "loss": 0.4112318754196167, + "step": 8150 + }, + { + "ce_loss": 0.16436515748500824, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "distill_loss": 0.11617519706487656, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "ref_ce_loss": 0.0827714130282402, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "loss": 0.32239362597465515, + "step": 8150 + }, + { + "ce_loss": 0.08988597244024277, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "distill_loss": 0.11604972928762436, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "ref_ce_loss": 0.08602083474397659, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "loss": 0.2883583903312683, + "step": 8150 + }, + { + "ce_loss": 0.03574028238654137, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "distill_loss": 0.0718480795621872, + "epoch": 2.7184789859906604, + "step": 8150 + }, + { + "epoch": 2.7184789859906604, + "ref_ce_loss": 0.06966337561607361, + "step": 8150 + }, + { + "epoch": 2.7218145430286858, + "loss": 0.5195, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "grad_norm": 2.757476329803467, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "learning_rate": 0.0002304506251973096, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "loss": 0.42667993903160095, + "step": 8160 + }, + { + "ce_loss": 0.15945284068584442, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "distill_loss": 0.15174272656440735, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "ref_ce_loss": 0.1153591200709343, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "loss": 0.47556787729263306, + "step": 8160 + }, + { + "ce_loss": 0.11233219504356384, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "distill_loss": 0.12334011495113373, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "ref_ce_loss": 0.12544937431812286, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "loss": 0.5468754172325134, + "step": 8160 + }, + { + "ce_loss": 0.11044180393218994, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "distill_loss": 0.10472964495420456, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "ref_ce_loss": 0.15179766714572906, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "loss": 0.4966447949409485, + "step": 8160 + }, + { + "ce_loss": 0.035888370126485825, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "distill_loss": 0.08420006185770035, + "epoch": 2.7218145430286858, + "step": 8160 + }, + { + "epoch": 2.7218145430286858, + "ref_ce_loss": 0.06462906301021576, + "step": 8160 + }, + { + "epoch": 2.725150100066711, + "loss": 0.5431, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "grad_norm": 2.061748504638672, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "learning_rate": 0.0002302795893216953, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "loss": 0.3870605230331421, + "step": 8170 + }, + { + "ce_loss": 0.14253829419612885, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "distill_loss": 0.11652734875679016, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "ref_ce_loss": 0.08488757163286209, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "loss": 0.49011707305908203, + "step": 8170 + }, + { + "ce_loss": 0.15634660422801971, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "distill_loss": 0.16235539317131042, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "ref_ce_loss": 0.13577237725257874, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "loss": 0.3121011555194855, + "step": 8170 + }, + { + "ce_loss": 0.11659318208694458, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "distill_loss": 0.10463510453701019, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "ref_ce_loss": 0.09063059836626053, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "loss": 0.46912479400634766, + "step": 8170 + }, + { + "ce_loss": 0.13982565701007843, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "distill_loss": 0.14655548334121704, + "epoch": 2.725150100066711, + "step": 8170 + }, + { + "epoch": 2.725150100066711, + "ref_ce_loss": 0.0921240821480751, + "step": 8170 + }, + { + "epoch": 2.7284856571047365, + "loss": 0.5177, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "grad_norm": 2.698974132537842, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "learning_rate": 0.00023010840704777773, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "loss": 0.3805660307407379, + "step": 8180 + }, + { + "ce_loss": 0.14469732344150543, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "distill_loss": 0.1307322382926941, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "ref_ce_loss": 0.10495664924383163, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "loss": 0.4237486720085144, + "step": 8180 + }, + { + "ce_loss": 0.11302044242620468, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "distill_loss": 0.18344856798648834, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "ref_ce_loss": 0.10202555358409882, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "loss": 0.9929975271224976, + "step": 8180 + }, + { + "ce_loss": 0.14125335216522217, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "distill_loss": 0.16645291447639465, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "ref_ce_loss": 0.12964120507240295, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "loss": 0.36327147483825684, + "step": 8180 + }, + { + "ce_loss": 0.07714872062206268, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "distill_loss": 0.09731505066156387, + "epoch": 2.7284856571047365, + "step": 8180 + }, + { + "epoch": 2.7284856571047365, + "ref_ce_loss": 0.07459522783756256, + "step": 8180 + }, + { + "epoch": 2.731821214142762, + "loss": 0.5482, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "grad_norm": 3.1269516944885254, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "learning_rate": 0.0002299370786877259, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "loss": 0.31505829095840454, + "step": 8190 + }, + { + "ce_loss": 0.0993107482790947, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "distill_loss": 0.08881866931915283, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "ref_ce_loss": 0.08592408150434494, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "loss": 0.518826961517334, + "step": 8190 + }, + { + "ce_loss": 0.20128180086612701, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "distill_loss": 0.11471576988697052, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "ref_ce_loss": 0.11057179421186447, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "loss": 0.7147469520568848, + "step": 8190 + }, + { + "ce_loss": 0.2774195969104767, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "distill_loss": 0.14111217856407166, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "ref_ce_loss": 0.15037333965301514, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "loss": 0.5206362009048462, + "step": 8190 + }, + { + "ce_loss": 0.12582939863204956, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "distill_loss": 0.11385433375835419, + "epoch": 2.731821214142762, + "step": 8190 + }, + { + "epoch": 2.731821214142762, + "ref_ce_loss": 0.090955950319767, + "step": 8190 + }, + { + "epoch": 2.735156771180787, + "loss": 0.5921, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "grad_norm": 3.280219793319702, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "learning_rate": 0.00022976560455397518, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "loss": 0.5218157172203064, + "step": 8200 + }, + { + "ce_loss": 0.10749214887619019, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "distill_loss": 0.10494686663150787, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "ref_ce_loss": 0.12046821415424347, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "loss": 0.35924088954925537, + "step": 8200 + }, + { + "ce_loss": 0.13240492343902588, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "distill_loss": 0.10641516745090485, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "ref_ce_loss": 0.12035661935806274, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "loss": 0.5238202214241028, + "step": 8200 + }, + { + "ce_loss": 0.24740082025527954, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "distill_loss": 0.10919897258281708, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "ref_ce_loss": 0.13922780752182007, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "loss": 0.6976079940795898, + "step": 8200 + }, + { + "ce_loss": 0.11878570169210434, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "distill_loss": 0.10893769562244415, + "epoch": 2.735156771180787, + "step": 8200 + }, + { + "epoch": 2.735156771180787, + "ref_ce_loss": 0.059214212000370026, + "step": 8200 + }, + { + "epoch": 2.7384923282188125, + "loss": 0.6152, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "grad_norm": 3.968153953552246, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "learning_rate": 0.00022959398495922667, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "loss": 0.5968283414840698, + "step": 8210 + }, + { + "ce_loss": 0.24610240757465363, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "distill_loss": 0.18845278024673462, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "ref_ce_loss": 0.16213765740394592, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "loss": 0.609846830368042, + "step": 8210 + }, + { + "ce_loss": 0.21443380415439606, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "distill_loss": 0.1477302461862564, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "ref_ce_loss": 0.12607622146606445, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "loss": 0.6559703946113586, + "step": 8210 + }, + { + "ce_loss": 0.15517359972000122, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "distill_loss": 0.1313042789697647, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "ref_ce_loss": 0.14865265786647797, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "loss": 0.42476436495780945, + "step": 8210 + }, + { + "ce_loss": 0.11427508294582367, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "distill_loss": 0.11243674904108047, + "epoch": 2.7384923282188125, + "step": 8210 + }, + { + "epoch": 2.7384923282188125, + "ref_ce_loss": 0.07395554333925247, + "step": 8210 + }, + { + "epoch": 2.741827885256838, + "loss": 0.5614, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "grad_norm": 2.2540650367736816, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "learning_rate": 0.00022942222021644693, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "loss": 0.3177460730075836, + "step": 8220 + }, + { + "ce_loss": 0.09816362708806992, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "distill_loss": 0.1109558641910553, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "ref_ce_loss": 0.10840912908315659, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "loss": 0.3551897406578064, + "step": 8220 + }, + { + "ce_loss": 0.14152449369430542, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "distill_loss": 0.11100196838378906, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "ref_ce_loss": 0.07071300595998764, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "loss": 0.7158865332603455, + "step": 8220 + }, + { + "ce_loss": 0.23116154968738556, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "distill_loss": 0.132449209690094, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "ref_ce_loss": 0.1193322092294693, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "loss": 0.9159722328186035, + "step": 8220 + }, + { + "ce_loss": 0.08125410228967667, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "distill_loss": 0.10034830868244171, + "epoch": 2.741827885256838, + "step": 8220 + }, + { + "epoch": 2.741827885256838, + "ref_ce_loss": 0.10371002554893494, + "step": 8220 + }, + { + "epoch": 2.745163442294863, + "loss": 0.5366, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "grad_norm": 2.331559181213379, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "learning_rate": 0.00022925031063886694, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "loss": 0.437788724899292, + "step": 8230 + }, + { + "ce_loss": 0.16893485188484192, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "distill_loss": 0.12921595573425293, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "ref_ce_loss": 0.10241150856018066, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "loss": 0.30514878034591675, + "step": 8230 + }, + { + "ce_loss": 0.12337757647037506, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "distill_loss": 0.10096579790115356, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "ref_ce_loss": 0.07967543601989746, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "loss": 0.35450685024261475, + "step": 8230 + }, + { + "ce_loss": 0.14865685999393463, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "distill_loss": 0.11411884427070618, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "ref_ce_loss": 0.09148301184177399, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "loss": 0.4512288570404053, + "step": 8230 + }, + { + "ce_loss": 0.1382657140493393, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "distill_loss": 0.10865326970815659, + "epoch": 2.745163442294863, + "step": 8230 + }, + { + "epoch": 2.745163442294863, + "ref_ce_loss": 0.13190177083015442, + "step": 8230 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.5102, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "grad_norm": 2.7061686515808105, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "learning_rate": 0.00022907825653998212, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.5048376321792603, + "step": 8240 + }, + { + "ce_loss": 0.12719407677650452, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "distill_loss": 0.1403333842754364, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "ref_ce_loss": 0.16057303547859192, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.5376629829406738, + "step": 8240 + }, + { + "ce_loss": 0.10417266190052032, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "distill_loss": 0.11442543566226959, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "ref_ce_loss": 0.07191190123558044, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.37093302607536316, + "step": 8240 + }, + { + "ce_loss": 0.10794583708047867, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "distill_loss": 0.10681041330099106, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "ref_ce_loss": 0.1053520068526268, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "loss": 0.42045828700065613, + "step": 8240 + }, + { + "ce_loss": 0.14314033091068268, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "distill_loss": 0.15592116117477417, + "epoch": 2.7484989993328885, + "step": 8240 + }, + { + "epoch": 2.7484989993328885, + "ref_ce_loss": 0.09708252549171448, + "step": 8240 + }, + { + "epoch": 2.751834556370914, + "loss": 0.5688, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "grad_norm": 4.964454650878906, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "learning_rate": 0.00022890605823355117, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "loss": 0.48830074071884155, + "step": 8250 + }, + { + "ce_loss": 0.1908160150051117, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "distill_loss": 0.14335381984710693, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "ref_ce_loss": 0.15347762405872345, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "loss": 0.34461137652397156, + "step": 8250 + }, + { + "ce_loss": 0.11253272742033005, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "distill_loss": 0.10419323295354843, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "ref_ce_loss": 0.07173644006252289, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "loss": 0.48704829812049866, + "step": 8250 + }, + { + "ce_loss": 0.22168782353401184, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "distill_loss": 0.11275321990251541, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "ref_ce_loss": 0.12069100886583328, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "loss": 0.253851979970932, + "step": 8250 + }, + { + "ce_loss": 0.08856004476547241, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "distill_loss": 0.09050433337688446, + "epoch": 2.751834556370914, + "step": 8250 + }, + { + "epoch": 2.751834556370914, + "ref_ce_loss": 0.07416380196809769, + "step": 8250 + }, + { + "epoch": 2.7551701134089392, + "loss": 0.509, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "grad_norm": 2.1522469520568848, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "learning_rate": 0.00022873371603359587, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "loss": 0.3628401756286621, + "step": 8260 + }, + { + "ce_loss": 0.06882809102535248, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "distill_loss": 0.11147104203701019, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "ref_ce_loss": 0.1081618219614029, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "loss": 0.35818222165107727, + "step": 8260 + }, + { + "ce_loss": 0.09174513816833496, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "distill_loss": 0.11023850739002228, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "ref_ce_loss": 0.10617993026971817, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "loss": 0.5432717800140381, + "step": 8260 + }, + { + "ce_loss": 0.19231824576854706, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "distill_loss": 0.15208996832370758, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "ref_ce_loss": 0.12890948355197906, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "loss": 0.41913700103759766, + "step": 8260 + }, + { + "ce_loss": 0.14565613865852356, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "distill_loss": 0.11267106235027313, + "epoch": 2.7551701134089392, + "step": 8260 + }, + { + "epoch": 2.7551701134089392, + "ref_ce_loss": 0.11973301321268082, + "step": 8260 + }, + { + "epoch": 2.7585056704469646, + "loss": 0.501, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "grad_norm": 2.99421763420105, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "learning_rate": 0.00022856123025440046, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "loss": 0.9480876326560974, + "step": 8270 + }, + { + "ce_loss": 0.17917704582214355, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "distill_loss": 0.16749121248722076, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "ref_ce_loss": 0.15291234850883484, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "loss": 0.5085225105285645, + "step": 8270 + }, + { + "ce_loss": 0.19998572766780853, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "distill_loss": 0.15323159098625183, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "ref_ce_loss": 0.15509232878684998, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "loss": 0.640496015548706, + "step": 8270 + }, + { + "ce_loss": 0.19523316621780396, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "distill_loss": 0.17683634161949158, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "ref_ce_loss": 0.13698916137218475, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "loss": 0.47830960154533386, + "step": 8270 + }, + { + "ce_loss": 0.12321220338344574, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "distill_loss": 0.1424240916967392, + "epoch": 2.7585056704469646, + "step": 8270 + }, + { + "epoch": 2.7585056704469646, + "ref_ce_loss": 0.17582586407661438, + "step": 8270 + }, + { + "epoch": 2.76184122748499, + "loss": 0.587, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "grad_norm": 2.557399272918701, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "learning_rate": 0.00022838860121051098, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "loss": 0.5317304134368896, + "step": 8280 + }, + { + "ce_loss": 0.12961804866790771, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "distill_loss": 0.1425638645887375, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "ref_ce_loss": 0.10886183381080627, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "loss": 0.6594823002815247, + "step": 8280 + }, + { + "ce_loss": 0.13466475903987885, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "distill_loss": 0.1664295643568039, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "ref_ce_loss": 0.10923914611339569, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "loss": 0.3929463028907776, + "step": 8280 + }, + { + "ce_loss": 0.117137610912323, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "distill_loss": 0.13499777019023895, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "ref_ce_loss": 0.09361618757247925, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "loss": 0.4497656226158142, + "step": 8280 + }, + { + "ce_loss": 0.11880599707365036, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "distill_loss": 0.1401321291923523, + "epoch": 2.76184122748499, + "step": 8280 + }, + { + "epoch": 2.76184122748499, + "ref_ce_loss": 0.1344946324825287, + "step": 8280 + }, + { + "epoch": 2.7651767845230153, + "loss": 0.5636, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "grad_norm": 2.3959028720855713, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "learning_rate": 0.0002282158292167346, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "loss": 0.3913949728012085, + "step": 8290 + }, + { + "ce_loss": 0.08157963305711746, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "distill_loss": 0.14165902137756348, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "ref_ce_loss": 0.11059600859880447, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "loss": 0.6704709529876709, + "step": 8290 + }, + { + "ce_loss": 0.20668728649616241, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "distill_loss": 0.18045960366725922, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "ref_ce_loss": 0.10442976653575897, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "loss": 0.5117354393005371, + "step": 8290 + }, + { + "ce_loss": 0.12252218276262283, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "distill_loss": 0.16555725038051605, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "ref_ce_loss": 0.07605114579200745, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "loss": 0.4941989481449127, + "step": 8290 + }, + { + "ce_loss": 0.14770467579364777, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "distill_loss": 0.15609993040561676, + "epoch": 2.7651767845230153, + "step": 8290 + }, + { + "epoch": 2.7651767845230153, + "ref_ce_loss": 0.09984522312879562, + "step": 8290 + }, + { + "epoch": 2.7685123415610406, + "loss": 0.5792, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "grad_norm": 2.6024227142333984, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "learning_rate": 0.0002280429145881394, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "loss": 0.3573533296585083, + "step": 8300 + }, + { + "ce_loss": 0.10002897679805756, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "distill_loss": 0.1016823798418045, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "ref_ce_loss": 0.1555853933095932, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "loss": 0.35432833433151245, + "step": 8300 + }, + { + "ce_loss": 0.15280495584011078, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "distill_loss": 0.10303471982479095, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "ref_ce_loss": 0.07455757260322571, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "loss": 0.5166060924530029, + "step": 8300 + }, + { + "ce_loss": 0.19417652487754822, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "distill_loss": 0.1766192615032196, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "ref_ce_loss": 0.11216067522764206, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "loss": 0.3201092779636383, + "step": 8300 + }, + { + "ce_loss": 0.11336085945367813, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "distill_loss": 0.09292483329772949, + "epoch": 2.7685123415610406, + "step": 8300 + }, + { + "epoch": 2.7685123415610406, + "ref_ce_loss": 0.09047871828079224, + "step": 8300 + }, + { + "epoch": 2.771847898599066, + "loss": 0.4947, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "grad_norm": 2.7137691974639893, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "learning_rate": 0.00022786985764005344, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "loss": 0.7143236398696899, + "step": 8310 + }, + { + "ce_loss": 0.26414355635643005, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "distill_loss": 0.1972856968641281, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "ref_ce_loss": 0.21074679493904114, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "loss": 0.3790608048439026, + "step": 8310 + }, + { + "ce_loss": 0.10393885523080826, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "distill_loss": 0.14341627061367035, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "ref_ce_loss": 0.08679898828268051, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "loss": 0.6864500045776367, + "step": 8310 + }, + { + "ce_loss": 0.23026344180107117, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "distill_loss": 0.23133046925067902, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "ref_ce_loss": 0.13559125363826752, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "loss": 0.5029471516609192, + "step": 8310 + }, + { + "ce_loss": 0.045588862150907516, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "distill_loss": 0.11035850644111633, + "epoch": 2.771847898599066, + "step": 8310 + }, + { + "epoch": 2.771847898599066, + "ref_ce_loss": 0.05270203575491905, + "step": 8310 + }, + { + "epoch": 2.7751834556370913, + "loss": 0.6118, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "grad_norm": 2.5758280754089355, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "learning_rate": 0.0002276966586880642, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "loss": 0.5190055966377258, + "step": 8320 + }, + { + "ce_loss": 0.17583709955215454, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "distill_loss": 0.1923707276582718, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "ref_ce_loss": 0.15066786110401154, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "loss": 0.43477120995521545, + "step": 8320 + }, + { + "ce_loss": 0.12779302895069122, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "distill_loss": 0.1646592915058136, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "ref_ce_loss": 0.14228765666484833, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "loss": 0.5125862956047058, + "step": 8320 + }, + { + "ce_loss": 0.22525674104690552, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "distill_loss": 0.15953534841537476, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "ref_ce_loss": 0.12673068046569824, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "loss": 0.47959643602371216, + "step": 8320 + }, + { + "ce_loss": 0.14553089439868927, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "distill_loss": 0.15800155699253082, + "epoch": 2.7751834556370913, + "step": 8320 + }, + { + "epoch": 2.7751834556370913, + "ref_ce_loss": 0.07305330038070679, + "step": 8320 + }, + { + "epoch": 2.7785190126751167, + "loss": 0.6519, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "grad_norm": 31.961301803588867, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "learning_rate": 0.00022752331804801843, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "loss": 0.3530633747577667, + "step": 8330 + }, + { + "ce_loss": 0.1247534304857254, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "distill_loss": 0.0954834520816803, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "ref_ce_loss": 0.13278992474079132, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "loss": 0.5396177172660828, + "step": 8330 + }, + { + "ce_loss": 0.233219712972641, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "distill_loss": 0.11943552643060684, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "ref_ce_loss": 0.14556007087230682, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "loss": 0.814037561416626, + "step": 8330 + }, + { + "ce_loss": 0.34219565987586975, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "distill_loss": 0.1418711394071579, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "ref_ce_loss": 0.17148993909358978, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "loss": 0.6103036403656006, + "step": 8330 + }, + { + "ce_loss": 0.2459089308977127, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "distill_loss": 0.12533476948738098, + "epoch": 2.7785190126751167, + "step": 8330 + }, + { + "epoch": 2.7785190126751167, + "ref_ce_loss": 0.11649178713560104, + "step": 8330 + }, + { + "epoch": 2.781854569713142, + "loss": 0.5584, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "grad_norm": 2.948256492614746, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "learning_rate": 0.000227349836036021, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "loss": 0.8184687495231628, + "step": 8340 + }, + { + "ce_loss": 0.15261517465114594, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "distill_loss": 0.15436916053295135, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "ref_ce_loss": 0.08771957457065582, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "loss": 0.32824552059173584, + "step": 8340 + }, + { + "ce_loss": 0.13067448139190674, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "distill_loss": 0.1223086267709732, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "ref_ce_loss": 0.07519922405481339, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "loss": 0.4334113299846649, + "step": 8340 + }, + { + "ce_loss": 0.1789129078388214, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "distill_loss": 0.1433398723602295, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "ref_ce_loss": 0.08752244710922241, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "loss": 0.585452139377594, + "step": 8340 + }, + { + "ce_loss": 0.2676447629928589, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "distill_loss": 0.1586453765630722, + "epoch": 2.781854569713142, + "step": 8340 + }, + { + "epoch": 2.781854569713142, + "ref_ce_loss": 0.10227011144161224, + "step": 8340 + }, + { + "epoch": 2.7851901267511674, + "loss": 0.5426, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "grad_norm": 3.5256121158599854, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "learning_rate": 0.0002271762129684346, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "loss": 0.5500671863555908, + "step": 8350 + }, + { + "ce_loss": 0.1510569006204605, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "distill_loss": 0.12426368147134781, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "ref_ce_loss": 0.177829310297966, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "loss": 0.4813498258590698, + "step": 8350 + }, + { + "ce_loss": 0.15770815312862396, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "distill_loss": 0.11759153753519058, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "ref_ce_loss": 0.10133638978004456, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "loss": 0.6662505865097046, + "step": 8350 + }, + { + "ce_loss": 0.11393112689256668, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "distill_loss": 0.10494688153266907, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "ref_ce_loss": 0.08218897879123688, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "loss": 0.5542718172073364, + "step": 8350 + }, + { + "ce_loss": 0.15418872237205505, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "distill_loss": 0.1185954362154007, + "epoch": 2.7851901267511674, + "step": 8350 + }, + { + "epoch": 2.7851901267511674, + "ref_ce_loss": 0.16383036971092224, + "step": 8350 + }, + { + "epoch": 2.7885256837891927, + "loss": 0.5317, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "grad_norm": 2.987302303314209, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "learning_rate": 0.00022700244916187934, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "loss": 0.5641549229621887, + "step": 8360 + }, + { + "ce_loss": 0.1565759927034378, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "distill_loss": 0.14667509496212006, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "ref_ce_loss": 0.15718014538288116, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "loss": 0.2849089503288269, + "step": 8360 + }, + { + "ce_loss": 0.07774198800325394, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "distill_loss": 0.09604795277118683, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "ref_ce_loss": 0.05384528264403343, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "loss": 0.5093336701393127, + "step": 8360 + }, + { + "ce_loss": 0.20802956819534302, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "distill_loss": 0.15040580928325653, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "ref_ce_loss": 0.15086127817630768, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "loss": 0.6868488788604736, + "step": 8360 + }, + { + "ce_loss": 0.10189005732536316, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "distill_loss": 0.142466738820076, + "epoch": 2.7885256837891927, + "step": 8360 + }, + { + "epoch": 2.7885256837891927, + "ref_ce_loss": 0.13379935920238495, + "step": 8360 + }, + { + "epoch": 2.791861240827218, + "loss": 0.5346, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "grad_norm": 3.1927130222320557, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "learning_rate": 0.0002268285449332317, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "loss": 0.9847922921180725, + "step": 8370 + }, + { + "ce_loss": 0.12005869299173355, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "distill_loss": 0.1328408420085907, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "ref_ce_loss": 0.1106012612581253, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "loss": 0.3865183889865875, + "step": 8370 + }, + { + "ce_loss": 0.16827566921710968, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "distill_loss": 0.10213863849639893, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "ref_ce_loss": 0.11604784429073334, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "loss": 0.7096348404884338, + "step": 8370 + }, + { + "ce_loss": 0.13443057239055634, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "distill_loss": 0.11132686585187912, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "ref_ce_loss": 0.10445878654718399, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "loss": 0.6106353998184204, + "step": 8370 + }, + { + "ce_loss": 0.25030606985092163, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "distill_loss": 0.1418341100215912, + "epoch": 2.791861240827218, + "step": 8370 + }, + { + "epoch": 2.791861240827218, + "ref_ce_loss": 0.1690225601196289, + "step": 8370 + }, + { + "epoch": 2.7951967978652434, + "loss": 0.566, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "grad_norm": 2.1361160278320312, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "learning_rate": 0.00022665450059962457, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "loss": 0.547217845916748, + "step": 8380 + }, + { + "ce_loss": 0.13093866407871246, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "distill_loss": 0.13123691082000732, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "ref_ce_loss": 0.18233349919319153, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "loss": 0.47471001744270325, + "step": 8380 + }, + { + "ce_loss": 0.1193649023771286, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "distill_loss": 0.13772958517074585, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "ref_ce_loss": 0.14166118204593658, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "loss": 0.5118974447250366, + "step": 8380 + }, + { + "ce_loss": 0.19769543409347534, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "distill_loss": 0.109315425157547, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "ref_ce_loss": 0.13785843551158905, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "loss": 0.27766746282577515, + "step": 8380 + }, + { + "ce_loss": 0.06863567233085632, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "distill_loss": 0.09863778203725815, + "epoch": 2.7951967978652434, + "step": 8380 + }, + { + "epoch": 2.7951967978652434, + "ref_ce_loss": 0.11031028628349304, + "step": 8380 + }, + { + "epoch": 2.798532354903269, + "loss": 0.5368, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "grad_norm": 3.499161958694458, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "learning_rate": 0.00022648031647844606, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "loss": 0.5296912789344788, + "step": 8390 + }, + { + "ce_loss": 0.12095022201538086, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "distill_loss": 0.27068638801574707, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "ref_ce_loss": 0.053802739828825, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "loss": 0.5279688239097595, + "step": 8390 + }, + { + "ce_loss": 0.17168818414211273, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "distill_loss": 0.1919613629579544, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "ref_ce_loss": 0.14083920419216156, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "loss": 0.5897954106330872, + "step": 8390 + }, + { + "ce_loss": 0.16287176311016083, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "distill_loss": 0.24030818045139313, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "ref_ce_loss": 0.10864845663309097, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "loss": 0.5427618622779846, + "step": 8390 + }, + { + "ce_loss": 0.16201457381248474, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "distill_loss": 0.28070592880249023, + "epoch": 2.798532354903269, + "step": 8390 + }, + { + "epoch": 2.798532354903269, + "ref_ce_loss": 0.09987376630306244, + "step": 8390 + }, + { + "epoch": 2.801867911941294, + "loss": 0.562, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "grad_norm": 2.6044695377349854, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "learning_rate": 0.0002263059928873393, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "loss": 0.5035500526428223, + "step": 8400 + }, + { + "ce_loss": 0.14720116555690765, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "distill_loss": 0.1462814062833786, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "ref_ce_loss": 0.10444889962673187, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "loss": 0.4154942035675049, + "step": 8400 + }, + { + "ce_loss": 0.09291025251150131, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "distill_loss": 0.12177328765392303, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "ref_ce_loss": 0.11366147547960281, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "loss": 0.7797332406044006, + "step": 8400 + }, + { + "ce_loss": 0.2732406556606293, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "distill_loss": 0.3229169547557831, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "ref_ce_loss": 0.18208087980747223, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "loss": 0.37686821818351746, + "step": 8400 + }, + { + "ce_loss": 0.10438577085733414, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "distill_loss": 0.19975148141384125, + "epoch": 2.801867911941294, + "step": 8400 + }, + { + "epoch": 2.801867911941294, + "ref_ce_loss": 0.07263541966676712, + "step": 8400 + }, + { + "epoch": 2.8052034689793195, + "loss": 0.5641, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "grad_norm": 3.1431491374969482, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "learning_rate": 0.0002261315301442018, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "loss": 0.316400945186615, + "step": 8410 + }, + { + "ce_loss": 0.0983375534415245, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "distill_loss": 0.11273324489593506, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "ref_ce_loss": 0.10528012365102768, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "loss": 0.3205793797969818, + "step": 8410 + }, + { + "ce_loss": 0.07206525653600693, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "distill_loss": 0.11558610200881958, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "ref_ce_loss": 0.1002664789557457, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "loss": 0.44273141026496887, + "step": 8410 + }, + { + "ce_loss": 0.14200718700885773, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "distill_loss": 0.16369619965553284, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "ref_ce_loss": 0.13701307773590088, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "loss": 0.3176065683364868, + "step": 8410 + }, + { + "ce_loss": 0.09353161603212357, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "distill_loss": 0.12242200970649719, + "epoch": 2.8052034689793195, + "step": 8410 + }, + { + "epoch": 2.8052034689793195, + "ref_ce_loss": 0.07452066242694855, + "step": 8410 + }, + { + "epoch": 2.808539026017345, + "loss": 0.5706, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "grad_norm": 4.882355213165283, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "learning_rate": 0.00022595692856718474, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "loss": 0.5670602321624756, + "step": 8420 + }, + { + "ce_loss": 0.15604117512702942, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "distill_loss": 0.15158408880233765, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "ref_ce_loss": 0.15006743371486664, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "loss": 0.43875157833099365, + "step": 8420 + }, + { + "ce_loss": 0.10357891023159027, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "distill_loss": 0.09662986546754837, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "ref_ce_loss": 0.12920770049095154, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "loss": 0.723873496055603, + "step": 8420 + }, + { + "ce_loss": 0.15056408941745758, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "distill_loss": 0.12403174489736557, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "ref_ce_loss": 0.11592728644609451, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "loss": 0.34784793853759766, + "step": 8420 + }, + { + "ce_loss": 0.12006805092096329, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "distill_loss": 0.07333897799253464, + "epoch": 2.808539026017345, + "step": 8420 + }, + { + "epoch": 2.808539026017345, + "ref_ce_loss": 0.07565838098526001, + "step": 8420 + }, + { + "epoch": 2.81187458305537, + "loss": 0.5974, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "grad_norm": 6.5593581199646, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "learning_rate": 0.00022578218847469253, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "loss": 0.38125699758529663, + "step": 8430 + }, + { + "ce_loss": 0.10962362587451935, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "distill_loss": 0.09133778512477875, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "ref_ce_loss": 0.08129521459341049, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "loss": 0.5570087432861328, + "step": 8430 + }, + { + "ce_loss": 0.2145906537771225, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "distill_loss": 0.14080560207366943, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "ref_ce_loss": 0.10355406254529953, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "loss": 0.3802940547466278, + "step": 8430 + }, + { + "ce_loss": 0.11159483343362808, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "distill_loss": 0.07547979056835175, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "ref_ce_loss": 0.08214235305786133, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "loss": 0.29137495160102844, + "step": 8430 + }, + { + "ce_loss": 0.11853925883769989, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "distill_loss": 0.10666890442371368, + "epoch": 2.81187458305537, + "step": 8430 + }, + { + "epoch": 2.81187458305537, + "ref_ce_loss": 0.06610703468322754, + "step": 8430 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.4931, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "grad_norm": 2.802873134613037, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "learning_rate": 0.00022560731018538222, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.25064149498939514, + "step": 8440 + }, + { + "ce_loss": 0.08421101421117783, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "distill_loss": 0.09479227662086487, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "ref_ce_loss": 0.05247608572244644, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.532885730266571, + "step": 8440 + }, + { + "ce_loss": 0.2541835904121399, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "distill_loss": 0.16259463131427765, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "ref_ce_loss": 0.09999735653400421, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.40857672691345215, + "step": 8440 + }, + { + "ce_loss": 0.15303872525691986, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "distill_loss": 0.1471904218196869, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "ref_ce_loss": 0.1081681102514267, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "loss": 0.6053186058998108, + "step": 8440 + }, + { + "ce_loss": 0.23108021914958954, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "distill_loss": 0.21530982851982117, + "epoch": 2.8152101400933955, + "step": 8440 + }, + { + "epoch": 2.8152101400933955, + "ref_ce_loss": 0.11973507702350616, + "step": 8440 + }, + { + "epoch": 2.818545697131421, + "loss": 0.5707, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "grad_norm": 5.992776870727539, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "learning_rate": 0.00022543229401816275, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "loss": 0.5136671662330627, + "step": 8450 + }, + { + "ce_loss": 0.1235622763633728, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "distill_loss": 0.12064239382743835, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "ref_ce_loss": 0.09468799829483032, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "loss": 0.8985491991043091, + "step": 8450 + }, + { + "ce_loss": 0.1961958259344101, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "distill_loss": 0.18243150413036346, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "ref_ce_loss": 0.10728978365659714, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "loss": 0.5325456857681274, + "step": 8450 + }, + { + "ce_loss": 0.11725273728370667, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "distill_loss": 0.12861433625221252, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "ref_ce_loss": 0.11166153848171234, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "loss": 0.5713732838630676, + "step": 8450 + }, + { + "ce_loss": 0.22279508411884308, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "distill_loss": 0.176978200674057, + "epoch": 2.818545697131421, + "step": 8450 + }, + { + "epoch": 2.818545697131421, + "ref_ce_loss": 0.11572866886854172, + "step": 8450 + }, + { + "epoch": 2.8218812541694462, + "loss": 0.5003, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "grad_norm": 1.9595544338226318, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "learning_rate": 0.00022525714029219453, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "loss": 0.421317994594574, + "step": 8460 + }, + { + "ce_loss": 0.1695428192615509, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "distill_loss": 0.11726689338684082, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "ref_ce_loss": 0.13403357565402985, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "loss": 0.5824626684188843, + "step": 8460 + }, + { + "ce_loss": 0.2222665250301361, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "distill_loss": 0.10591816902160645, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "ref_ce_loss": 0.11349005252122879, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "loss": 0.5425556302070618, + "step": 8460 + }, + { + "ce_loss": 0.18343253433704376, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "distill_loss": 0.14774461090564728, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "ref_ce_loss": 0.11161774396896362, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "loss": 0.5546457767486572, + "step": 8460 + }, + { + "ce_loss": 0.1968740075826645, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "distill_loss": 0.15189194679260254, + "epoch": 2.8218812541694462, + "step": 8460 + }, + { + "epoch": 2.8218812541694462, + "ref_ce_loss": 0.14472328126430511, + "step": 8460 + }, + { + "epoch": 2.8252168112074716, + "loss": 0.5254, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "grad_norm": 2.391209602355957, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "learning_rate": 0.00022508184932688903, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "loss": 0.9489675760269165, + "step": 8470 + }, + { + "ce_loss": 0.13908937573432922, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "distill_loss": 0.1329590082168579, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "ref_ce_loss": 0.1170150637626648, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "loss": 0.6241788864135742, + "step": 8470 + }, + { + "ce_loss": 0.19099032878875732, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "distill_loss": 0.10044612735509872, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "ref_ce_loss": 0.17403073608875275, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "loss": 0.4744298458099365, + "step": 8470 + }, + { + "ce_loss": 0.1622345745563507, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "distill_loss": 0.11944203078746796, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "ref_ce_loss": 0.14652585983276367, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "loss": 0.38154834508895874, + "step": 8470 + }, + { + "ce_loss": 0.13473011553287506, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "distill_loss": 0.11211343854665756, + "epoch": 2.8252168112074716, + "step": 8470 + }, + { + "epoch": 2.8252168112074716, + "ref_ce_loss": 0.10368572175502777, + "step": 8470 + }, + { + "epoch": 2.828552368245497, + "loss": 0.5224, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "grad_norm": 3.044171094894409, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "learning_rate": 0.00022490642144190774, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "loss": 0.5300574898719788, + "step": 8480 + }, + { + "ce_loss": 0.18093335628509521, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "distill_loss": 0.14212843775749207, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "ref_ce_loss": 0.09574255347251892, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "loss": 0.9047300815582275, + "step": 8480 + }, + { + "ce_loss": 0.23037096858024597, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "distill_loss": 0.1440839022397995, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "ref_ce_loss": 0.13643290102481842, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "loss": 0.5250870585441589, + "step": 8480 + }, + { + "ce_loss": 0.10960116982460022, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "distill_loss": 0.17659179866313934, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "ref_ce_loss": 0.18103836476802826, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "loss": 0.376802921295166, + "step": 8480 + }, + { + "ce_loss": 0.15248258411884308, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "distill_loss": 0.10913614183664322, + "epoch": 2.828552368245497, + "step": 8480 + }, + { + "epoch": 2.828552368245497, + "ref_ce_loss": 0.11468908935785294, + "step": 8480 + }, + { + "epoch": 2.8318879252835223, + "loss": 0.5157, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "grad_norm": 2.604387044906616, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "learning_rate": 0.00022473085695716183, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "loss": 0.3220893144607544, + "step": 8490 + }, + { + "ce_loss": 0.09508085250854492, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "distill_loss": 0.1276741623878479, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "ref_ce_loss": 0.09848184138536453, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "loss": 0.732323408126831, + "step": 8490 + }, + { + "ce_loss": 0.1797773838043213, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "distill_loss": 0.12821955978870392, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "ref_ce_loss": 0.1704844832420349, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "loss": 0.641586422920227, + "step": 8490 + }, + { + "ce_loss": 0.15142354369163513, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "distill_loss": 0.14353042840957642, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "ref_ce_loss": 0.12409044802188873, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "loss": 0.5020027756690979, + "step": 8490 + }, + { + "ce_loss": 0.1881706863641739, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "distill_loss": 0.2028343379497528, + "epoch": 2.8318879252835223, + "step": 8490 + }, + { + "epoch": 2.8318879252835223, + "ref_ce_loss": 0.11063137650489807, + "step": 8490 + }, + { + "epoch": 2.8352234823215476, + "loss": 0.5539, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "grad_norm": 3.075287342071533, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "learning_rate": 0.0002245551561928118, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "loss": 0.5149841904640198, + "step": 8500 + }, + { + "ce_loss": 0.18708448112010956, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "distill_loss": 0.18625451624393463, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "ref_ce_loss": 0.10302000492811203, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "loss": 0.5088472962379456, + "step": 8500 + }, + { + "ce_loss": 0.18305392563343048, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "distill_loss": 0.12284497916698456, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "ref_ce_loss": 0.14784277975559235, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "loss": 0.6096242070198059, + "step": 8500 + }, + { + "ce_loss": 0.18820351362228394, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "distill_loss": 0.1708454191684723, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "ref_ce_loss": 0.11443863064050674, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "loss": 0.5093251466751099, + "step": 8500 + }, + { + "ce_loss": 0.23517775535583496, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "distill_loss": 0.1514825075864792, + "epoch": 2.8352234823215476, + "step": 8500 + }, + { + "epoch": 2.8352234823215476, + "ref_ce_loss": 0.12234192341566086, + "step": 8500 + }, + { + "epoch": 2.838559039359573, + "loss": 0.556, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "grad_norm": 2.9502902030944824, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "learning_rate": 0.00022437931946926647, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "loss": 0.49666211009025574, + "step": 8510 + }, + { + "ce_loss": 0.17453722655773163, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "distill_loss": 0.18285943567752838, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "ref_ce_loss": 0.13843190670013428, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "loss": 0.6453526616096497, + "step": 8510 + }, + { + "ce_loss": 0.23797675967216492, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "distill_loss": 0.1395404189825058, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "ref_ce_loss": 0.1344192624092102, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "loss": 0.5082492232322693, + "step": 8510 + }, + { + "ce_loss": 0.1746096909046173, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "distill_loss": 0.17531262338161469, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "ref_ce_loss": 0.13140884041786194, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "loss": 0.758216381072998, + "step": 8510 + }, + { + "ce_loss": 0.20714110136032104, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "distill_loss": 0.17534372210502625, + "epoch": 2.838559039359573, + "step": 8510 + }, + { + "epoch": 2.838559039359573, + "ref_ce_loss": 0.16164039075374603, + "step": 8510 + }, + { + "epoch": 2.8418945963975983, + "loss": 0.558, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "grad_norm": 2.721283435821533, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "learning_rate": 0.00022420334710718267, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "loss": 0.607196569442749, + "step": 8520 + }, + { + "ce_loss": 0.21133388578891754, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "distill_loss": 0.12246505171060562, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "ref_ce_loss": 0.16589756309986115, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "loss": 0.4256008565425873, + "step": 8520 + }, + { + "ce_loss": 0.12766288220882416, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "distill_loss": 0.16494876146316528, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "ref_ce_loss": 0.09781324863433838, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "loss": 0.6566895246505737, + "step": 8520 + }, + { + "ce_loss": 0.15638019144535065, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "distill_loss": 0.16774064302444458, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "ref_ce_loss": 0.11066441237926483, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "loss": 0.48018428683280945, + "step": 8520 + }, + { + "ce_loss": 0.19729620218276978, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "distill_loss": 0.16175724565982819, + "epoch": 2.8418945963975983, + "step": 8520 + }, + { + "epoch": 2.8418945963975983, + "ref_ce_loss": 0.11984910815954208, + "step": 8520 + }, + { + "epoch": 2.8452301534356237, + "loss": 0.5809, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "grad_norm": 2.210233688354492, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "learning_rate": 0.00022402723942746466, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "loss": 0.5266987085342407, + "step": 8530 + }, + { + "ce_loss": 0.12296690046787262, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "distill_loss": 0.15192703902721405, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "ref_ce_loss": 0.14122067391872406, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "loss": 0.7991921901702881, + "step": 8530 + }, + { + "ce_loss": 0.13369446992874146, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "distill_loss": 0.17547550797462463, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "ref_ce_loss": 0.15770867466926575, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "loss": 0.5874632596969604, + "step": 8530 + }, + { + "ce_loss": 0.21728350222110748, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "distill_loss": 0.1586538851261139, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "ref_ce_loss": 0.153802290558815, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "loss": 0.47910526394844055, + "step": 8530 + }, + { + "ce_loss": 0.15307433903217316, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "distill_loss": 0.1651879996061325, + "epoch": 2.8452301534356237, + "step": 8530 + }, + { + "epoch": 2.8452301534356237, + "ref_ce_loss": 0.11388645321130753, + "step": 8530 + }, + { + "epoch": 2.848565710473649, + "loss": 0.5447, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "grad_norm": 2.7099955081939697, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "learning_rate": 0.0002238509967512632, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "loss": 0.5581467151641846, + "step": 8540 + }, + { + "ce_loss": 0.18522784113883972, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "distill_loss": 0.16879862546920776, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "ref_ce_loss": 0.14357973635196686, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "loss": 0.6665406227111816, + "step": 8540 + }, + { + "ce_loss": 0.25393146276474, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "distill_loss": 0.14920659363269806, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "ref_ce_loss": 0.18525826930999756, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "loss": 0.5111413598060608, + "step": 8540 + }, + { + "ce_loss": 0.1705455332994461, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "distill_loss": 0.14063940942287445, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "ref_ce_loss": 0.09902207553386688, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "loss": 0.350668728351593, + "step": 8540 + }, + { + "ce_loss": 0.14593736827373505, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "distill_loss": 0.09514002501964569, + "epoch": 2.848565710473649, + "step": 8540 + }, + { + "epoch": 2.848565710473649, + "ref_ce_loss": 0.10897145420312881, + "step": 8540 + }, + { + "epoch": 2.8519012675116744, + "loss": 0.5035, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "grad_norm": 2.573655128479004, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "learning_rate": 0.00022367461939997552, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "loss": 0.5167698264122009, + "step": 8550 + }, + { + "ce_loss": 0.13213999569416046, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "distill_loss": 0.14889588952064514, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "ref_ce_loss": 0.12083609402179718, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "loss": 0.5313799977302551, + "step": 8550 + }, + { + "ce_loss": 0.19686628878116608, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "distill_loss": 0.14351768791675568, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "ref_ce_loss": 0.14347802102565765, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "loss": 0.9321268796920776, + "step": 8550 + }, + { + "ce_loss": 0.16296499967575073, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "distill_loss": 0.14142106473445892, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "ref_ce_loss": 0.10340377688407898, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "loss": 0.4632156491279602, + "step": 8550 + }, + { + "ce_loss": 0.18383440375328064, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "distill_loss": 0.1084541380405426, + "epoch": 2.8519012675116744, + "step": 8550 + }, + { + "epoch": 2.8519012675116744, + "ref_ce_loss": 0.10253027826547623, + "step": 8550 + }, + { + "epoch": 2.8552368245496997, + "loss": 0.5808, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "grad_norm": 3.327956199645996, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "learning_rate": 0.00022349810769524436, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "loss": 0.4661470949649811, + "step": 8560 + }, + { + "ce_loss": 0.1542460322380066, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "distill_loss": 0.18235129117965698, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "ref_ce_loss": 0.08700333535671234, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "loss": 0.573223352432251, + "step": 8560 + }, + { + "ce_loss": 0.1726389080286026, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "distill_loss": 0.16355568170547485, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "ref_ce_loss": 0.13505931198596954, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "loss": 0.4333297610282898, + "step": 8560 + }, + { + "ce_loss": 0.1419287770986557, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "distill_loss": 0.19388234615325928, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "ref_ce_loss": 0.09745925664901733, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "loss": 0.5763939023017883, + "step": 8560 + }, + { + "ce_loss": 0.10823322832584381, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "distill_loss": 0.14647899568080902, + "epoch": 2.8552368245496997, + "step": 8560 + }, + { + "epoch": 2.8552368245496997, + "ref_ce_loss": 0.15130794048309326, + "step": 8560 + }, + { + "epoch": 2.858572381587725, + "loss": 0.5607, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "grad_norm": 2.5268304347991943, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "learning_rate": 0.00022332146195895735, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "loss": 0.4295397996902466, + "step": 8570 + }, + { + "ce_loss": 0.1499362289905548, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "distill_loss": 0.1379919946193695, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "ref_ce_loss": 0.10437241941690445, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "loss": 0.34164315462112427, + "step": 8570 + }, + { + "ce_loss": 0.08813903480768204, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "distill_loss": 0.11740319430828094, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "ref_ce_loss": 0.128663569688797, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "loss": 0.4116790294647217, + "step": 8570 + }, + { + "ce_loss": 0.1062823235988617, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "distill_loss": 0.11568897217512131, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "ref_ce_loss": 0.14106427133083344, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "loss": 0.44529253244400024, + "step": 8570 + }, + { + "ce_loss": 0.14703154563903809, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "distill_loss": 0.12601909041404724, + "epoch": 2.858572381587725, + "step": 8570 + }, + { + "epoch": 2.858572381587725, + "ref_ce_loss": 0.1380028873682022, + "step": 8570 + }, + { + "epoch": 2.8619079386257504, + "loss": 0.5535, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "grad_norm": 3.367523193359375, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "learning_rate": 0.00022314468251324673, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "loss": 0.3660839796066284, + "step": 8580 + }, + { + "ce_loss": 0.09687898308038712, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "distill_loss": 0.1386238932609558, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "ref_ce_loss": 0.09564144909381866, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "loss": 0.5628844499588013, + "step": 8580 + }, + { + "ce_loss": 0.20551376044750214, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "distill_loss": 0.13932254910469055, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "ref_ce_loss": 0.1882753074169159, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "loss": 0.3394158184528351, + "step": 8580 + }, + { + "ce_loss": 0.0823008194565773, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "distill_loss": 0.0964013859629631, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "ref_ce_loss": 0.09652000665664673, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "loss": 0.2966267466545105, + "step": 8580 + }, + { + "ce_loss": 0.10649916529655457, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "distill_loss": 0.08755770325660706, + "epoch": 2.8619079386257504, + "step": 8580 + }, + { + "epoch": 2.8619079386257504, + "ref_ce_loss": 0.10237345099449158, + "step": 8580 + }, + { + "epoch": 2.865243495663776, + "loss": 0.5546, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "grad_norm": 4.27254581451416, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "learning_rate": 0.0002229677696804884, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "loss": 0.5530468821525574, + "step": 8590 + }, + { + "ce_loss": 0.14216595888137817, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "distill_loss": 0.15951929986476898, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "ref_ce_loss": 0.12193866074085236, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "loss": 0.5057868957519531, + "step": 8590 + }, + { + "ce_loss": 0.14068260788917542, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "distill_loss": 0.15862120687961578, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "ref_ce_loss": 0.12405318766832352, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "loss": 0.5077252388000488, + "step": 8590 + }, + { + "ce_loss": 0.06671689450740814, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "distill_loss": 0.11470813304185867, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "ref_ce_loss": 0.08538176864385605, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "loss": 0.49542105197906494, + "step": 8590 + }, + { + "ce_loss": 0.1653415858745575, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "distill_loss": 0.14084473252296448, + "epoch": 2.865243495663776, + "step": 8590 + }, + { + "epoch": 2.865243495663776, + "ref_ce_loss": 0.11440587788820267, + "step": 8590 + }, + { + "epoch": 2.868579052701801, + "loss": 0.5217, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "grad_norm": 2.1853246688842773, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "learning_rate": 0.00022279072378330163, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "loss": 0.5349823832511902, + "step": 8600 + }, + { + "ce_loss": 0.21476206183433533, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "distill_loss": 0.12671056389808655, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "ref_ce_loss": 0.16497483849525452, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "loss": 0.5629706978797913, + "step": 8600 + }, + { + "ce_loss": 0.1847279965877533, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "distill_loss": 0.13843335211277008, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "ref_ce_loss": 0.14642126858234406, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "loss": 0.6087683439254761, + "step": 8600 + }, + { + "ce_loss": 0.15599875152111053, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "distill_loss": 0.12127721309661865, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "ref_ce_loss": 0.11146479099988937, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "loss": 0.48046231269836426, + "step": 8600 + }, + { + "ce_loss": 0.10944525897502899, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "distill_loss": 0.1440810263156891, + "epoch": 2.868579052701801, + "step": 8600 + }, + { + "epoch": 2.868579052701801, + "ref_ce_loss": 0.1533537060022354, + "step": 8600 + }, + { + "epoch": 2.8719146097398265, + "loss": 0.5264, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "grad_norm": 2.3612961769104004, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "learning_rate": 0.00022261354514454827, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "loss": 0.30287694931030273, + "step": 8610 + }, + { + "ce_loss": 0.05707375332713127, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "distill_loss": 0.08226659148931503, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "ref_ce_loss": 0.07613756507635117, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "loss": 0.6359444856643677, + "step": 8610 + }, + { + "ce_loss": 0.25475236773490906, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "distill_loss": 0.16380392014980316, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "ref_ce_loss": 0.18352565169334412, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "loss": 0.6667486429214478, + "step": 8610 + }, + { + "ce_loss": 0.26272544264793396, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "distill_loss": 0.08766535669565201, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "ref_ce_loss": 0.1847449392080307, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "loss": 0.5750166177749634, + "step": 8610 + }, + { + "ce_loss": 0.23870283365249634, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "distill_loss": 0.09349337220191956, + "epoch": 2.8719146097398265, + "step": 8610 + }, + { + "epoch": 2.8719146097398265, + "ref_ce_loss": 0.12696757912635803, + "step": 8610 + }, + { + "epoch": 2.875250166777852, + "loss": 0.5242, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "grad_norm": 2.3082547187805176, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "learning_rate": 0.0002224362340873323, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "loss": 0.40311920642852783, + "step": 8620 + }, + { + "ce_loss": 0.15748664736747742, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "distill_loss": 0.105941042304039, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "ref_ce_loss": 0.09454337507486343, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "loss": 0.6777150630950928, + "step": 8620 + }, + { + "ce_loss": 0.1892915666103363, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "distill_loss": 0.09386925399303436, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "ref_ce_loss": 0.13299092650413513, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "loss": 0.4655272960662842, + "step": 8620 + }, + { + "ce_loss": 0.13446469604969025, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "distill_loss": 0.09398121386766434, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "ref_ce_loss": 0.09203039854764938, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "loss": 0.27186840772628784, + "step": 8620 + }, + { + "ce_loss": 0.0731753557920456, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "distill_loss": 0.09868510812520981, + "epoch": 2.875250166777852, + "step": 8620 + }, + { + "epoch": 2.875250166777852, + "ref_ce_loss": 0.06487337499856949, + "step": 8620 + }, + { + "epoch": 2.878585723815877, + "loss": 0.5088, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "grad_norm": 2.4868619441986084, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "learning_rate": 0.000222258790934999, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "loss": 0.5052057504653931, + "step": 8630 + }, + { + "ce_loss": 0.12210213392972946, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "distill_loss": 0.09557357430458069, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "ref_ce_loss": 0.19164274632930756, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "loss": 0.8311383724212646, + "step": 8630 + }, + { + "ce_loss": 0.3233627378940582, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "distill_loss": 0.16602776944637299, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "ref_ce_loss": 0.16990318894386292, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "loss": 0.3017067611217499, + "step": 8630 + }, + { + "ce_loss": 0.06859374791383743, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "distill_loss": 0.11418160051107407, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "ref_ce_loss": 0.0983542650938034, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "loss": 0.45237842202186584, + "step": 8630 + }, + { + "ce_loss": 0.17673709988594055, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "distill_loss": 0.11338216066360474, + "epoch": 2.878585723815877, + "step": 8630 + }, + { + "epoch": 2.878585723815877, + "ref_ce_loss": 0.11751232296228409, + "step": 8630 + }, + { + "epoch": 2.8819212808539025, + "loss": 0.5389, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "grad_norm": 2.4058783054351807, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "learning_rate": 0.00022208121601113493, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "loss": 0.49721455574035645, + "step": 8640 + }, + { + "ce_loss": 0.1393839567899704, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "distill_loss": 0.12871664762496948, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "ref_ce_loss": 0.11616840958595276, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "loss": 0.5112259387969971, + "step": 8640 + }, + { + "ce_loss": 0.12560014426708221, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "distill_loss": 0.1277984380722046, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "ref_ce_loss": 0.11363084614276886, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "loss": 0.5464163422584534, + "step": 8640 + }, + { + "ce_loss": 0.20722423493862152, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "distill_loss": 0.1460934579372406, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "ref_ce_loss": 0.14361216127872467, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "loss": 0.42684420943260193, + "step": 8640 + }, + { + "ce_loss": 0.1484360545873642, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "distill_loss": 0.10624878108501434, + "epoch": 2.8819212808539025, + "step": 8640 + }, + { + "epoch": 2.8819212808539025, + "ref_ce_loss": 0.119191475212574, + "step": 8640 + }, + { + "epoch": 2.885256837891928, + "loss": 0.5271, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "grad_norm": 3.429231643676758, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "learning_rate": 0.00022190350963956652, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "loss": 0.3691232204437256, + "step": 8650 + }, + { + "ce_loss": 0.1358044445514679, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "distill_loss": 0.13041932880878448, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "ref_ce_loss": 0.10271897912025452, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "loss": 0.32777976989746094, + "step": 8650 + }, + { + "ce_loss": 0.12133575230836868, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "distill_loss": 0.10511480271816254, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "ref_ce_loss": 0.10117995738983154, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "loss": 0.39452067017555237, + "step": 8650 + }, + { + "ce_loss": 0.18165063858032227, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "distill_loss": 0.12682943046092987, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "ref_ce_loss": 0.08599083125591278, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "loss": 0.5932126045227051, + "step": 8650 + }, + { + "ce_loss": 0.18555493652820587, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "distill_loss": 0.11078682541847229, + "epoch": 2.885256837891928, + "step": 8650 + }, + { + "epoch": 2.885256837891928, + "ref_ce_loss": 0.12941141426563263, + "step": 8650 + }, + { + "epoch": 2.8885923949299532, + "loss": 0.5087, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "grad_norm": 3.141085386276245, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "learning_rate": 0.00022172567214436014, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "loss": 0.43682029843330383, + "step": 8660 + }, + { + "ce_loss": 0.18460293114185333, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "distill_loss": 0.09788291156291962, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "ref_ce_loss": 0.11332383006811142, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "loss": 0.783757209777832, + "step": 8660 + }, + { + "ce_loss": 0.22142735123634338, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "distill_loss": 0.1374686062335968, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "ref_ce_loss": 0.14957596361637115, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "loss": 0.4457992911338806, + "step": 8660 + }, + { + "ce_loss": 0.1515551507472992, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "distill_loss": 0.09355000406503677, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "ref_ce_loss": 0.1352471560239792, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "loss": 0.430512011051178, + "step": 8660 + }, + { + "ce_loss": 0.129452183842659, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "distill_loss": 0.1036761999130249, + "epoch": 2.8885923949299532, + "step": 8660 + }, + { + "epoch": 2.8885923949299532, + "ref_ce_loss": 0.10788404196500778, + "step": 8660 + }, + { + "epoch": 2.8919279519679786, + "loss": 0.5535, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "grad_norm": 3.5380215644836426, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "learning_rate": 0.0002215477038498213, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "loss": 0.6049115061759949, + "step": 8670 + }, + { + "ce_loss": 0.10785181075334549, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "distill_loss": 0.10674380511045456, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "ref_ce_loss": 0.13331040740013123, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "loss": 0.8881338238716125, + "step": 8670 + }, + { + "ce_loss": 0.23795866966247559, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "distill_loss": 0.13352984189987183, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "ref_ce_loss": 0.14402584731578827, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "loss": 0.4040985107421875, + "step": 8670 + }, + { + "ce_loss": 0.103199303150177, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "distill_loss": 0.09984700381755829, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "ref_ce_loss": 0.14553777873516083, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "loss": 0.34721651673316956, + "step": 8670 + }, + { + "ce_loss": 0.15223266184329987, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "distill_loss": 0.11905837804079056, + "epoch": 2.8919279519679786, + "step": 8670 + }, + { + "epoch": 2.8919279519679786, + "ref_ce_loss": 0.07565715163946152, + "step": 8670 + }, + { + "epoch": 2.895263509006004, + "loss": 0.4798, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "grad_norm": 4.128468036651611, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "learning_rate": 0.0002213696050804938, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "loss": 0.576320230960846, + "step": 8680 + }, + { + "ce_loss": 0.1257760226726532, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "distill_loss": 0.0989554151892662, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "ref_ce_loss": 0.10930144041776657, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "loss": 0.3506055474281311, + "step": 8680 + }, + { + "ce_loss": 0.06738583743572235, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "distill_loss": 0.12411016970872879, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "ref_ce_loss": 0.09874910861253738, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "loss": 0.5777968168258667, + "step": 8680 + }, + { + "ce_loss": 0.190630704164505, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "distill_loss": 0.11528972536325455, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "ref_ce_loss": 0.11596431583166122, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "loss": 0.6189478635787964, + "step": 8680 + }, + { + "ce_loss": 0.16328558325767517, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "distill_loss": 0.13485164940357208, + "epoch": 2.895263509006004, + "step": 8680 + }, + { + "epoch": 2.895263509006004, + "ref_ce_loss": 0.15076056122779846, + "step": 8680 + }, + { + "epoch": 2.8985990660440293, + "loss": 0.5104, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "grad_norm": 3.6191985607147217, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "learning_rate": 0.00022119137616115973, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "loss": 0.31480494141578674, + "step": 8690 + }, + { + "ce_loss": 0.09158436208963394, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "distill_loss": 0.1079840138554573, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "ref_ce_loss": 0.06985091418027878, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "loss": 1.2861382961273193, + "step": 8690 + }, + { + "ce_loss": 0.29323792457580566, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "distill_loss": 0.10477292537689209, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "ref_ce_loss": 0.18968039751052856, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "loss": 0.48443499207496643, + "step": 8690 + }, + { + "ce_loss": 0.2007952183485031, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "distill_loss": 0.0963059738278389, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "ref_ce_loss": 0.14208665490150452, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "loss": 0.3987194895744324, + "step": 8690 + }, + { + "ce_loss": 0.13310086727142334, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "distill_loss": 0.09960196912288666, + "epoch": 2.8985990660440293, + "step": 8690 + }, + { + "epoch": 2.8985990660440293, + "ref_ce_loss": 0.0725984126329422, + "step": 8690 + }, + { + "epoch": 2.9019346230820546, + "loss": 0.5389, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "grad_norm": 4.235092639923096, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "learning_rate": 0.0002210130174168382, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "loss": 0.32756808400154114, + "step": 8700 + }, + { + "ce_loss": 0.15331578254699707, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "distill_loss": 0.08755943179130554, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "ref_ce_loss": 0.08651743829250336, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "loss": 0.7419276237487793, + "step": 8700 + }, + { + "ce_loss": 0.1952676922082901, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "distill_loss": 0.08272796869277954, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "ref_ce_loss": 0.11741222441196442, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "loss": 0.42398780584335327, + "step": 8700 + }, + { + "ce_loss": 0.11199457943439484, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "distill_loss": 0.1243538111448288, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "ref_ce_loss": 0.11631476879119873, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "loss": 0.43274134397506714, + "step": 8700 + }, + { + "ce_loss": 0.19206500053405762, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "distill_loss": 0.1058146059513092, + "epoch": 2.9019346230820546, + "step": 8700 + }, + { + "epoch": 2.9019346230820546, + "ref_ce_loss": 0.09739596396684647, + "step": 8700 + }, + { + "epoch": 2.90527018012008, + "loss": 0.5265, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "grad_norm": 2.5261056423187256, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "learning_rate": 0.00022083452917278528, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "loss": 0.6119447350502014, + "step": 8710 + }, + { + "ce_loss": 0.16569578647613525, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "distill_loss": 0.09654286503791809, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "ref_ce_loss": 0.14469215273857117, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "loss": 0.42332643270492554, + "step": 8710 + }, + { + "ce_loss": 0.16344575583934784, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "distill_loss": 0.0723377913236618, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "ref_ce_loss": 0.15298517048358917, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "loss": 0.3651186227798462, + "step": 8710 + }, + { + "ce_loss": 0.12102488428354263, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "distill_loss": 0.08429036289453506, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "ref_ce_loss": 0.09180473536252975, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "loss": 0.5617263317108154, + "step": 8710 + }, + { + "ce_loss": 0.2145652323961258, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "distill_loss": 0.11610198765993118, + "epoch": 2.90527018012008, + "step": 8710 + }, + { + "epoch": 2.90527018012008, + "ref_ce_loss": 0.17532667517662048, + "step": 8710 + }, + { + "epoch": 2.9086057371581053, + "loss": 0.4692, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "grad_norm": 2.6236250400543213, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "learning_rate": 0.00022065591175449305, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "loss": 0.6628434658050537, + "step": 8720 + }, + { + "ce_loss": 0.23668566346168518, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "distill_loss": 0.13840903341770172, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "ref_ce_loss": 0.1786912977695465, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "loss": 0.35061073303222656, + "step": 8720 + }, + { + "ce_loss": 0.11093682795763016, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "distill_loss": 0.10949292778968811, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "ref_ce_loss": 0.0809154137969017, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "loss": 0.3698574900627136, + "step": 8720 + }, + { + "ce_loss": 0.12995347380638123, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "distill_loss": 0.11510443687438965, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "ref_ce_loss": 0.12470285594463348, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "loss": 0.332610160112381, + "step": 8720 + }, + { + "ce_loss": 0.13286007940769196, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "distill_loss": 0.09024094045162201, + "epoch": 2.9086057371581053, + "step": 8720 + }, + { + "epoch": 2.9086057371581053, + "ref_ce_loss": 0.10945945233106613, + "step": 8720 + }, + { + "epoch": 2.9119412941961307, + "loss": 0.4866, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "grad_norm": 2.74001145362854, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "learning_rate": 0.00022047716548768934, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "loss": 0.5908645391464233, + "step": 8730 + }, + { + "ce_loss": 0.15341097116470337, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "distill_loss": 0.09469150006771088, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "ref_ce_loss": 0.0913006067276001, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "loss": 0.3112438917160034, + "step": 8730 + }, + { + "ce_loss": 0.11778094619512558, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "distill_loss": 0.09843090176582336, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "ref_ce_loss": 0.09495732933282852, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "loss": 0.6640909910202026, + "step": 8730 + }, + { + "ce_loss": 0.12501554191112518, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "distill_loss": 0.1084146797657013, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "ref_ce_loss": 0.16660518944263458, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "loss": 0.3979160487651825, + "step": 8730 + }, + { + "ce_loss": 0.0966411754488945, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "distill_loss": 0.10279390215873718, + "epoch": 2.9119412941961307, + "step": 8730 + }, + { + "epoch": 2.9119412941961307, + "ref_ce_loss": 0.0864546075463295, + "step": 8730 + }, + { + "epoch": 2.915276851234156, + "loss": 0.5306, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "grad_norm": 4.032469272613525, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "learning_rate": 0.0002202982906983367, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "loss": 0.35388892889022827, + "step": 8740 + }, + { + "ce_loss": 0.11793344467878342, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "distill_loss": 0.09678888320922852, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "ref_ce_loss": 0.10981198400259018, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "loss": 0.2746281921863556, + "step": 8740 + }, + { + "ce_loss": 0.07362538576126099, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "distill_loss": 0.10906033217906952, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "ref_ce_loss": 0.09183456748723984, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "loss": 0.4105778932571411, + "step": 8740 + }, + { + "ce_loss": 0.12222662568092346, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "distill_loss": 0.11339154839515686, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "ref_ce_loss": 0.12201227992773056, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "loss": 0.5338358879089355, + "step": 8740 + }, + { + "ce_loss": 0.07666455209255219, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "distill_loss": 0.09451141953468323, + "epoch": 2.915276851234156, + "step": 8740 + }, + { + "epoch": 2.915276851234156, + "ref_ce_loss": 0.09166429936885834, + "step": 8740 + }, + { + "epoch": 2.9186124082721814, + "loss": 0.4786, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "grad_norm": 4.57519006729126, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "learning_rate": 0.00022011928771263227, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "loss": 0.4377441704273224, + "step": 8750 + }, + { + "ce_loss": 0.09397535771131516, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "distill_loss": 0.08135636150836945, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "ref_ce_loss": 0.0735674798488617, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "loss": 0.5145371556282043, + "step": 8750 + }, + { + "ce_loss": 0.18467354774475098, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "distill_loss": 0.1418350636959076, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "ref_ce_loss": 0.13718147575855255, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "loss": 0.43943333625793457, + "step": 8750 + }, + { + "ce_loss": 0.1606380194425583, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "distill_loss": 0.12608714401721954, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "ref_ce_loss": 0.11991623789072037, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "loss": 0.34229615330696106, + "step": 8750 + }, + { + "ce_loss": 0.13352441787719727, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "distill_loss": 0.11652510613203049, + "epoch": 2.9186124082721814, + "step": 8750 + }, + { + "epoch": 2.9186124082721814, + "ref_ce_loss": 0.09219865500926971, + "step": 8750 + }, + { + "epoch": 2.9219479653102067, + "loss": 0.5343, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "grad_norm": 2.5483882427215576, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "learning_rate": 0.00021994015685700686, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "loss": 0.45735830068588257, + "step": 8760 + }, + { + "ce_loss": 0.2106165885925293, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "distill_loss": 0.11669822037220001, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "ref_ce_loss": 0.08740370720624924, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "loss": 0.5426672101020813, + "step": 8760 + }, + { + "ce_loss": 0.14788727462291718, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "distill_loss": 0.08242390304803848, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "ref_ce_loss": 0.09694897383451462, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "loss": 0.5055618286132812, + "step": 8760 + }, + { + "ce_loss": 0.12635177373886108, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "distill_loss": 0.12514209747314453, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "ref_ce_loss": 0.18664951622486115, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "loss": 0.40916872024536133, + "step": 8760 + }, + { + "ce_loss": 0.17162492871284485, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "distill_loss": 0.11223581433296204, + "epoch": 2.9219479653102067, + "step": 8760 + }, + { + "epoch": 2.9219479653102067, + "ref_ce_loss": 0.08056915551424026, + "step": 8760 + }, + { + "epoch": 2.925283522348232, + "loss": 0.4792, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "grad_norm": 3.125194787979126, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "learning_rate": 0.00021976089845812438, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "loss": 0.8893252611160278, + "step": 8770 + }, + { + "ce_loss": 0.2755357027053833, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "distill_loss": 0.15055133402347565, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "ref_ce_loss": 0.17915786802768707, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "loss": 0.5901782512664795, + "step": 8770 + }, + { + "ce_loss": 0.08927714824676514, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "distill_loss": 0.09461453557014465, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "ref_ce_loss": 0.10167305171489716, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "loss": 0.495090514421463, + "step": 8770 + }, + { + "ce_loss": 0.1624784767627716, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "distill_loss": 0.138736754655838, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "ref_ce_loss": 0.14611607789993286, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "loss": 0.6977149248123169, + "step": 8770 + }, + { + "ce_loss": 0.1322113424539566, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "distill_loss": 0.11863041669130325, + "epoch": 2.925283522348232, + "step": 8770 + }, + { + "epoch": 2.925283522348232, + "ref_ce_loss": 0.11613546311855316, + "step": 8770 + }, + { + "epoch": 2.9286190793862574, + "loss": 0.5235, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "grad_norm": 3.383786201477051, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "learning_rate": 0.00021958151284288166, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "loss": 0.31101635098457336, + "step": 8780 + }, + { + "ce_loss": 0.12315681576728821, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "distill_loss": 0.10906527936458588, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "ref_ce_loss": 0.07862290740013123, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "loss": 0.5271515846252441, + "step": 8780 + }, + { + "ce_loss": 0.1860402375459671, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "distill_loss": 0.16446442902088165, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "ref_ce_loss": 0.1760958433151245, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "loss": 0.4236556887626648, + "step": 8780 + }, + { + "ce_loss": 0.17246517539024353, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "distill_loss": 0.09926985204219818, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "ref_ce_loss": 0.09923750162124634, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "loss": 0.4667871594429016, + "step": 8780 + }, + { + "ce_loss": 0.20223020017147064, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "distill_loss": 0.13251884281635284, + "epoch": 2.9286190793862574, + "step": 8780 + }, + { + "epoch": 2.9286190793862574, + "ref_ce_loss": 0.1317136138677597, + "step": 8780 + }, + { + "epoch": 2.931954636424283, + "loss": 0.4888, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "grad_norm": 2.1957993507385254, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "learning_rate": 0.00021940200033840714, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "loss": 0.4426005482673645, + "step": 8790 + }, + { + "ce_loss": 0.1768053025007248, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "distill_loss": 0.11535248160362244, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "ref_ce_loss": 0.10447856783866882, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "loss": 0.7331949472427368, + "step": 8790 + }, + { + "ce_loss": 0.2538256347179413, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "distill_loss": 0.1367458701133728, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "ref_ce_loss": 0.15786539018154144, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "loss": 0.2962862551212311, + "step": 8790 + }, + { + "ce_loss": 0.04074576124548912, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "distill_loss": 0.09793291985988617, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "ref_ce_loss": 0.10114602744579315, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "loss": 0.9041996002197266, + "step": 8790 + }, + { + "ce_loss": 0.18689587712287903, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "distill_loss": 0.1299264132976532, + "epoch": 2.931954636424283, + "step": 8790 + }, + { + "epoch": 2.931954636424283, + "ref_ce_loss": 0.17926687002182007, + "step": 8790 + }, + { + "epoch": 2.935290193462308, + "loss": 0.5363, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "grad_norm": 2.918760299682617, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "learning_rate": 0.00021922236127206083, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "loss": 0.4478914737701416, + "step": 8800 + }, + { + "ce_loss": 0.11208988726139069, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "distill_loss": 0.1034972220659256, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "ref_ce_loss": 0.14677636325359344, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "loss": 0.35105398297309875, + "step": 8800 + }, + { + "ce_loss": 0.09494055062532425, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "distill_loss": 0.11517714709043503, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "ref_ce_loss": 0.11402986943721771, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "loss": 0.523328423500061, + "step": 8800 + }, + { + "ce_loss": 0.20816966891288757, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "distill_loss": 0.15322241187095642, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "ref_ce_loss": 0.12490464001893997, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "loss": 0.474077433347702, + "step": 8800 + }, + { + "ce_loss": 0.19899730384349823, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "distill_loss": 0.10921809822320938, + "epoch": 2.935290193462308, + "step": 8800 + }, + { + "epoch": 2.935290193462308, + "ref_ce_loss": 0.12667261064052582, + "step": 8800 + }, + { + "epoch": 2.9386257505003335, + "loss": 0.5603, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "grad_norm": 3.7591745853424072, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "learning_rate": 0.00021904259597143357, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "loss": 0.28778815269470215, + "step": 8810 + }, + { + "ce_loss": 0.07531074434518814, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "distill_loss": 0.07752246409654617, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "ref_ce_loss": 0.13453662395477295, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "loss": 0.38257116079330444, + "step": 8810 + }, + { + "ce_loss": 0.1476069539785385, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "distill_loss": 0.11568806320428848, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "ref_ce_loss": 0.08604934066534042, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "loss": 0.5415005683898926, + "step": 8810 + }, + { + "ce_loss": 0.09510982036590576, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "distill_loss": 0.10478687286376953, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "ref_ce_loss": 0.10170865058898926, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "loss": 0.3948809504508972, + "step": 8810 + }, + { + "ce_loss": 0.0706266388297081, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "distill_loss": 0.09294119477272034, + "epoch": 2.9386257505003335, + "step": 8810 + }, + { + "epoch": 2.9386257505003335, + "ref_ce_loss": 0.0870620384812355, + "step": 8810 + }, + { + "epoch": 2.941961307538359, + "loss": 0.4867, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "grad_norm": 2.9408466815948486, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "learning_rate": 0.0002188627047643464, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "loss": 0.5317444205284119, + "step": 8820 + }, + { + "ce_loss": 0.2094917893409729, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "distill_loss": 0.10389654338359833, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "ref_ce_loss": 0.11777611076831818, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "loss": 0.3371651768684387, + "step": 8820 + }, + { + "ce_loss": 0.08623742312192917, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "distill_loss": 0.1031375601887703, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "ref_ce_loss": 0.1100323423743248, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "loss": 0.20720899105072021, + "step": 8820 + }, + { + "ce_loss": 0.043221935629844666, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "distill_loss": 0.0784808024764061, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "ref_ce_loss": 0.0510651059448719, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "loss": 0.2733457684516907, + "step": 8820 + }, + { + "ce_loss": 0.09705958515405655, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "distill_loss": 0.0906776711344719, + "epoch": 2.941961307538359, + "step": 8820 + }, + { + "epoch": 2.941961307538359, + "ref_ce_loss": 0.08549680560827255, + "step": 8820 + }, + { + "epoch": 2.945296864576384, + "loss": 0.5421, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "grad_norm": 2.729051113128662, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "learning_rate": 0.00021868268797884977, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "loss": 0.6006139516830444, + "step": 8830 + }, + { + "ce_loss": 0.16521619260311127, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "distill_loss": 0.11474260687828064, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "ref_ce_loss": 0.11602941900491714, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "loss": 0.3933120667934418, + "step": 8830 + }, + { + "ce_loss": 0.1256079524755478, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "distill_loss": 0.11178599298000336, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "ref_ce_loss": 0.11281760036945343, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "loss": 0.5145058035850525, + "step": 8830 + }, + { + "ce_loss": 0.2089313119649887, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "distill_loss": 0.12459740787744522, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "ref_ce_loss": 0.1330752670764923, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "loss": 0.4225755035877228, + "step": 8830 + }, + { + "ce_loss": 0.15203280746936798, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "distill_loss": 0.09559550881385803, + "epoch": 2.945296864576384, + "step": 8830 + }, + { + "epoch": 2.945296864576384, + "ref_ce_loss": 0.1744471788406372, + "step": 8830 + }, + { + "epoch": 2.9486324216144095, + "loss": 0.4838, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "grad_norm": 3.221513509750366, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "learning_rate": 0.00021850254594322344, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "loss": 0.5516033172607422, + "step": 8840 + }, + { + "ce_loss": 0.1504536122083664, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "distill_loss": 0.13644813001155853, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "ref_ce_loss": 0.08675995469093323, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "loss": 0.5576673150062561, + "step": 8840 + }, + { + "ce_loss": 0.16298459470272064, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "distill_loss": 0.13553835451602936, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "ref_ce_loss": 0.12538892030715942, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "loss": 0.8474454879760742, + "step": 8840 + }, + { + "ce_loss": 0.1506018489599228, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "distill_loss": 0.13082599639892578, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "ref_ce_loss": 0.14312021434307098, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "loss": 0.549639880657196, + "step": 8840 + }, + { + "ce_loss": 0.22064092755317688, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "distill_loss": 0.14355742931365967, + "epoch": 2.9486324216144095, + "step": 8840 + }, + { + "epoch": 2.9486324216144095, + "ref_ce_loss": 0.1570969969034195, + "step": 8840 + }, + { + "epoch": 2.951967978652435, + "loss": 0.5521, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "grad_norm": 2.628145217895508, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "learning_rate": 0.00021832227898597531, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "loss": 0.8192481994628906, + "step": 8850 + }, + { + "ce_loss": 0.12512989342212677, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "distill_loss": 0.12027587741613388, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "ref_ce_loss": 0.14873076975345612, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "loss": 0.4840548038482666, + "step": 8850 + }, + { + "ce_loss": 0.10911354422569275, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "distill_loss": 0.1427203267812729, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "ref_ce_loss": 0.14467257261276245, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "loss": 0.45594289898872375, + "step": 8850 + }, + { + "ce_loss": 0.12329255044460297, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "distill_loss": 0.11281190812587738, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "ref_ce_loss": 0.14958961308002472, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "loss": 0.34237849712371826, + "step": 8850 + }, + { + "ce_loss": 0.12874117493629456, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "distill_loss": 0.11954338848590851, + "epoch": 2.951967978652435, + "step": 8850 + }, + { + "epoch": 2.951967978652435, + "ref_ce_loss": 0.09373276680707932, + "step": 8850 + }, + { + "epoch": 2.9553035356904602, + "loss": 0.5088, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "grad_norm": 2.186516523361206, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "learning_rate": 0.00021814188743584127, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "loss": 0.3231074810028076, + "step": 8860 + }, + { + "ce_loss": 0.10812241584062576, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "distill_loss": 0.1252068430185318, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "ref_ce_loss": 0.08961108326911926, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "loss": 0.6042312383651733, + "step": 8860 + }, + { + "ce_loss": 0.22129030525684357, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "distill_loss": 0.11306517571210861, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "ref_ce_loss": 0.1952359974384308, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "loss": 0.35771751403808594, + "step": 8860 + }, + { + "ce_loss": 0.14291593432426453, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "distill_loss": 0.121894471347332, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "ref_ce_loss": 0.09281952679157257, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "loss": 0.5629582405090332, + "step": 8860 + }, + { + "ce_loss": 0.13195520639419556, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "distill_loss": 0.1128406897187233, + "epoch": 2.9553035356904602, + "step": 8860 + }, + { + "epoch": 2.9553035356904602, + "ref_ce_loss": 0.11142636090517044, + "step": 8860 + }, + { + "epoch": 2.9586390927284856, + "loss": 0.5349, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "grad_norm": 3.5918776988983154, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "learning_rate": 0.00021796137162178434, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "loss": 0.9226340055465698, + "step": 8870 + }, + { + "ce_loss": 0.1620820164680481, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "distill_loss": 0.11741302907466888, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "ref_ce_loss": 0.13054203987121582, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "loss": 0.5688709020614624, + "step": 8870 + }, + { + "ce_loss": 0.17032210528850555, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "distill_loss": 0.09201649576425552, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "ref_ce_loss": 0.1125497967004776, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "loss": 0.833220362663269, + "step": 8870 + }, + { + "ce_loss": 0.1887754648923874, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "distill_loss": 0.12600192427635193, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "ref_ce_loss": 0.11542544513940811, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "loss": 0.4784468114376068, + "step": 8870 + }, + { + "ce_loss": 0.18221917748451233, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "distill_loss": 0.12532924115657806, + "epoch": 2.9586390927284856, + "step": 8870 + }, + { + "epoch": 2.9586390927284856, + "ref_ce_loss": 0.12261589616537094, + "step": 8870 + }, + { + "epoch": 2.961974649766511, + "loss": 0.5122, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "grad_norm": 4.213609218597412, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "learning_rate": 0.0002177807318729941, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "loss": 0.5542953014373779, + "step": 8880 + }, + { + "ce_loss": 0.17619939148426056, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "distill_loss": 0.1372058242559433, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "ref_ce_loss": 0.12143444269895554, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "loss": 0.3381030857563019, + "step": 8880 + }, + { + "ce_loss": 0.08107311278581619, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "distill_loss": 0.08807455003261566, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "ref_ce_loss": 0.1255367249250412, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "loss": 0.5628779530525208, + "step": 8880 + }, + { + "ce_loss": 0.11000987887382507, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "distill_loss": 0.1430390328168869, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "ref_ce_loss": 0.12550552189350128, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "loss": 0.5493335723876953, + "step": 8880 + }, + { + "ce_loss": 0.16359969973564148, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "distill_loss": 0.10959623754024506, + "epoch": 2.961974649766511, + "step": 8880 + }, + { + "epoch": 2.961974649766511, + "ref_ce_loss": 0.1296955943107605, + "step": 8880 + }, + { + "epoch": 2.9653102068045363, + "loss": 0.5355, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "grad_norm": 4.092010021209717, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "learning_rate": 0.0002175999685188863, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "loss": 0.5732136368751526, + "step": 8890 + }, + { + "ce_loss": 0.14554201066493988, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "distill_loss": 0.1008024662733078, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "ref_ce_loss": 0.08674776554107666, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "loss": 0.8132926225662231, + "step": 8890 + }, + { + "ce_loss": 0.1978740245103836, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "distill_loss": 0.11106804758310318, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "ref_ce_loss": 0.15045933425426483, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "loss": 0.6142012476921082, + "step": 8890 + }, + { + "ce_loss": 0.1520072966814041, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "distill_loss": 0.11832404136657715, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "ref_ce_loss": 0.06626222282648087, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "loss": 0.4120936691761017, + "step": 8890 + }, + { + "ce_loss": 0.16986437141895294, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "distill_loss": 0.11584517359733582, + "epoch": 2.9653102068045363, + "step": 8890 + }, + { + "epoch": 2.9653102068045363, + "ref_ce_loss": 0.09480530768632889, + "step": 8890 + }, + { + "epoch": 2.9686457638425616, + "loss": 0.5003, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "grad_norm": 2.9350826740264893, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "learning_rate": 0.00021741908188910192, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "loss": 0.6007994413375854, + "step": 8900 + }, + { + "ce_loss": 0.1910446435213089, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "distill_loss": 0.13157133758068085, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "ref_ce_loss": 0.10696756839752197, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "loss": 0.591404139995575, + "step": 8900 + }, + { + "ce_loss": 0.1629049926996231, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "distill_loss": 0.14794319868087769, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "ref_ce_loss": 0.1411333680152893, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "loss": 0.4213687479496002, + "step": 8900 + }, + { + "ce_loss": 0.1346403956413269, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "distill_loss": 0.11285700649023056, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "ref_ce_loss": 0.1737578809261322, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "loss": 0.5505105257034302, + "step": 8900 + }, + { + "ce_loss": 0.15355099737644196, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "distill_loss": 0.096382737159729, + "epoch": 2.9686457638425616, + "step": 8900 + }, + { + "epoch": 2.9686457638425616, + "ref_ce_loss": 0.12684959173202515, + "step": 8900 + }, + { + "epoch": 2.971981320880587, + "loss": 0.5263, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "grad_norm": 3.6547932624816895, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "learning_rate": 0.00021723807231350685, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "loss": 0.5092654824256897, + "step": 8910 + }, + { + "ce_loss": 0.1908111721277237, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "distill_loss": 0.16151322424411774, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "ref_ce_loss": 0.11491622030735016, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "loss": 0.4845525026321411, + "step": 8910 + }, + { + "ce_loss": 0.17053207755088806, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "distill_loss": 0.11018142849206924, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "ref_ce_loss": 0.11159297078847885, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "loss": 0.4210514426231384, + "step": 8910 + }, + { + "ce_loss": 0.09162740409374237, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "distill_loss": 0.07755187153816223, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "ref_ce_loss": 0.07986513525247574, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "loss": 0.6294732689857483, + "step": 8910 + }, + { + "ce_loss": 0.18695926666259766, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "distill_loss": 0.10205936431884766, + "epoch": 2.971981320880587, + "step": 8910 + }, + { + "epoch": 2.971981320880587, + "ref_ce_loss": 0.14748182892799377, + "step": 8910 + }, + { + "epoch": 2.9753168779186123, + "loss": 0.4895, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "grad_norm": 3.938183069229126, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "learning_rate": 0.00021705694012219106, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "loss": 0.31843438744544983, + "step": 8920 + }, + { + "ce_loss": 0.1231832280755043, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "distill_loss": 0.10385991632938385, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "ref_ce_loss": 0.09056884795427322, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "loss": 0.38025349378585815, + "step": 8920 + }, + { + "ce_loss": 0.1042127013206482, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "distill_loss": 0.09548382461071014, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "ref_ce_loss": 0.073456771671772, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "loss": 0.5087049007415771, + "step": 8920 + }, + { + "ce_loss": 0.21662679314613342, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "distill_loss": 0.1301024705171585, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "ref_ce_loss": 0.12792813777923584, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "loss": 0.4105304777622223, + "step": 8920 + }, + { + "ce_loss": 0.08565964549779892, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "distill_loss": 0.11993283033370972, + "epoch": 2.9753168779186123, + "step": 8920 + }, + { + "epoch": 2.9753168779186123, + "ref_ce_loss": 0.13341321051120758, + "step": 8920 + }, + { + "epoch": 2.9786524349566377, + "loss": 0.559, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "grad_norm": 3.1146602630615234, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "learning_rate": 0.00021687568564546838, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "loss": 0.3035496771335602, + "step": 8930 + }, + { + "ce_loss": 0.09508823603391647, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "distill_loss": 0.09790746867656708, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "ref_ce_loss": 0.08749634027481079, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "loss": 0.44061118364334106, + "step": 8930 + }, + { + "ce_loss": 0.13150790333747864, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "distill_loss": 0.14861956238746643, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "ref_ce_loss": 0.12680856883525848, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "loss": 0.6023802757263184, + "step": 8930 + }, + { + "ce_loss": 0.18394355475902557, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "distill_loss": 0.12831705808639526, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "ref_ce_loss": 0.11957930773496628, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "loss": 0.5828354954719543, + "step": 8930 + }, + { + "ce_loss": 0.21842427551746368, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "distill_loss": 0.1230367124080658, + "epoch": 2.9786524349566377, + "step": 8930 + }, + { + "epoch": 2.9786524349566377, + "ref_ce_loss": 0.1871989518404007, + "step": 8930 + }, + { + "epoch": 2.981987991994663, + "loss": 0.4852, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "grad_norm": 2.2136104106903076, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "learning_rate": 0.00021669430921387534, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "loss": 0.493175208568573, + "step": 8940 + }, + { + "ce_loss": 0.2051461786031723, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "distill_loss": 0.13799598813056946, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "ref_ce_loss": 0.14984993636608124, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "loss": 0.5004106760025024, + "step": 8940 + }, + { + "ce_loss": 0.1794445663690567, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "distill_loss": 0.118271604180336, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "ref_ce_loss": 0.1278836578130722, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "loss": 0.2564985454082489, + "step": 8940 + }, + { + "ce_loss": 0.10331545770168304, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "distill_loss": 0.09063772112131119, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "ref_ce_loss": 0.045049406588077545, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "loss": 0.5444272756576538, + "step": 8940 + }, + { + "ce_loss": 0.13070593774318695, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "distill_loss": 0.1038922667503357, + "epoch": 2.981987991994663, + "step": 8940 + }, + { + "epoch": 2.981987991994663, + "ref_ce_loss": 0.07918254286050797, + "step": 8940 + }, + { + "epoch": 2.9853235490326884, + "loss": 0.516, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "grad_norm": 2.4029927253723145, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "learning_rate": 0.00021651281115817102, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "loss": 0.5528447031974792, + "step": 8950 + }, + { + "ce_loss": 0.24434742331504822, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "distill_loss": 0.13547056913375854, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "ref_ce_loss": 0.11006741225719452, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "loss": 0.3580153286457062, + "step": 8950 + }, + { + "ce_loss": 0.11471273005008698, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "distill_loss": 0.11067156493663788, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "ref_ce_loss": 0.1322018951177597, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "loss": 0.2604046165943146, + "step": 8950 + }, + { + "ce_loss": 0.10595227032899857, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "distill_loss": 0.09654046595096588, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "ref_ce_loss": 0.05750226601958275, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "loss": 0.8750475645065308, + "step": 8950 + }, + { + "ce_loss": 0.13068419694900513, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "distill_loss": 0.11134978383779526, + "epoch": 2.9853235490326884, + "step": 8950 + }, + { + "epoch": 2.9853235490326884, + "ref_ce_loss": 0.10884664952754974, + "step": 8950 + }, + { + "epoch": 2.9886591060707137, + "loss": 0.5772, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "grad_norm": 2.57700514793396, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "learning_rate": 0.00021633119180933634, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "loss": 0.354244202375412, + "step": 8960 + }, + { + "ce_loss": 0.09961654990911484, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "distill_loss": 0.11927730590105057, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "ref_ce_loss": 0.09512662887573242, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "loss": 0.4154999852180481, + "step": 8960 + }, + { + "ce_loss": 0.1644178330898285, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "distill_loss": 0.12879574298858643, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "ref_ce_loss": 0.12126053869724274, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "loss": 0.733989417552948, + "step": 8960 + }, + { + "ce_loss": 0.19184887409210205, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "distill_loss": 0.12150511890649796, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "ref_ce_loss": 0.17909909784793854, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "loss": 0.7652954459190369, + "step": 8960 + }, + { + "ce_loss": 0.1274799257516861, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "distill_loss": 0.10142847150564194, + "epoch": 2.9886591060707137, + "step": 8960 + }, + { + "epoch": 2.9886591060707137, + "ref_ce_loss": 0.16671831905841827, + "step": 8960 + }, + { + "epoch": 2.991994663108739, + "loss": 0.4935, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "grad_norm": 2.5278472900390625, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "learning_rate": 0.00021614945149857334, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "loss": 0.3677584230899811, + "step": 8970 + }, + { + "ce_loss": 0.14266683161258698, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "distill_loss": 0.14370639622211456, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "ref_ce_loss": 0.08122113347053528, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "loss": 0.45084571838378906, + "step": 8970 + }, + { + "ce_loss": 0.0990084707736969, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "distill_loss": 0.11535260081291199, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "ref_ce_loss": 0.10417357832193375, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "loss": 0.38456830382347107, + "step": 8970 + }, + { + "ce_loss": 0.1252870112657547, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "distill_loss": 0.11813102662563324, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "ref_ce_loss": 0.1055547371506691, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "loss": 0.47269147634506226, + "step": 8970 + }, + { + "ce_loss": 0.22794851660728455, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "distill_loss": 0.11512558162212372, + "epoch": 2.991994663108739, + "step": 8970 + }, + { + "epoch": 2.991994663108739, + "ref_ce_loss": 0.12925688922405243, + "step": 8970 + }, + { + "epoch": 2.9953302201467644, + "loss": 0.5518, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "grad_norm": 3.004040002822876, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "learning_rate": 0.00021596759055730465, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "loss": 0.45926421880722046, + "step": 8980 + }, + { + "ce_loss": 0.20763225853443146, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "distill_loss": 0.15198659896850586, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "ref_ce_loss": 0.09958194941282272, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "loss": 0.3914671242237091, + "step": 8980 + }, + { + "ce_loss": 0.1506868302822113, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "distill_loss": 0.11587530374526978, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "ref_ce_loss": 0.07094470411539078, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "loss": 0.4542044699192047, + "step": 8980 + }, + { + "ce_loss": 0.16601303219795227, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "distill_loss": 0.1144561842083931, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "ref_ce_loss": 0.09616707265377045, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "loss": 0.8602047562599182, + "step": 8980 + }, + { + "ce_loss": 0.18182548880577087, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "distill_loss": 0.13099931180477142, + "epoch": 2.9953302201467644, + "step": 8980 + }, + { + "epoch": 2.9953302201467644, + "ref_ce_loss": 0.15818721055984497, + "step": 8980 + }, + { + "epoch": 2.9986657771847898, + "loss": 0.5197, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "grad_norm": 3.4914965629577637, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "learning_rate": 0.0002157856093171728, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "loss": 0.8473386764526367, + "step": 8990 + }, + { + "ce_loss": 0.1329667866230011, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "distill_loss": 0.10136765241622925, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "ref_ce_loss": 0.12561270594596863, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "loss": 0.3459990918636322, + "step": 8990 + }, + { + "ce_loss": 0.09148216247558594, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "distill_loss": 0.08927734196186066, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "ref_ce_loss": 0.16515298187732697, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "loss": 0.2844806909561157, + "step": 8990 + }, + { + "ce_loss": 0.10032972693443298, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "distill_loss": 0.12092792987823486, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "ref_ce_loss": 0.063165083527565, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "loss": 0.5512446761131287, + "step": 8990 + }, + { + "ce_loss": 0.17064183950424194, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "distill_loss": 0.10145299881696701, + "epoch": 2.9986657771847898, + "step": 8990 + }, + { + "epoch": 2.9986657771847898, + "ref_ce_loss": 0.0986415445804596, + "step": 8990 + }, + { + "epoch": 3.002001334222815, + "loss": 0.4782, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "grad_norm": 2.815939426422119, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "learning_rate": 0.0002156035081100399, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "loss": 0.5625020861625671, + "step": 9000 + }, + { + "ce_loss": 0.15781192481517792, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "distill_loss": 0.15901319682598114, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "ref_ce_loss": 0.14345714449882507, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "loss": 0.30363473296165466, + "step": 9000 + }, + { + "ce_loss": 0.12738262116909027, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "distill_loss": 0.11215686798095703, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "ref_ce_loss": 0.06402139365673065, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "loss": 0.4059531092643738, + "step": 9000 + }, + { + "ce_loss": 0.11209169775247574, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "distill_loss": 0.13779860734939575, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "ref_ce_loss": 0.11790633201599121, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "loss": 0.5732775926589966, + "step": 9000 + }, + { + "ce_loss": 0.2084646373987198, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "distill_loss": 0.1004604697227478, + "epoch": 3.002001334222815, + "step": 9000 + }, + { + "epoch": 3.002001334222815, + "ref_ce_loss": 0.11193086206912994, + "step": 9000 + }, + { + "epoch": 3.0053368912608405, + "loss": 0.4867, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "grad_norm": 2.182608127593994, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "learning_rate": 0.0002154212872679867, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "loss": 0.7315200567245483, + "step": 9010 + }, + { + "ce_loss": 0.2323620617389679, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "distill_loss": 0.1421075165271759, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "ref_ce_loss": 0.12837104499340057, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "loss": 0.410982608795166, + "step": 9010 + }, + { + "ce_loss": 0.19520390033721924, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "distill_loss": 0.12332181632518768, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "ref_ce_loss": 0.09218749403953552, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "loss": 0.4925827383995056, + "step": 9010 + }, + { + "ce_loss": 0.2317257672548294, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "distill_loss": 0.15137000381946564, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "ref_ce_loss": 0.10924065113067627, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "loss": 0.4396410584449768, + "step": 9010 + }, + { + "ce_loss": 0.16249550879001617, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "distill_loss": 0.14607492089271545, + "epoch": 3.0053368912608405, + "step": 9010 + }, + { + "epoch": 3.0053368912608405, + "ref_ce_loss": 0.09223807603120804, + "step": 9010 + }, + { + "epoch": 3.008672448298866, + "loss": 0.5155, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "grad_norm": 1.9353896379470825, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "learning_rate": 0.00021523894712331215, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "loss": 0.3911009430885315, + "step": 9020 + }, + { + "ce_loss": 0.0835895836353302, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "distill_loss": 0.16284173727035522, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "ref_ce_loss": 0.11544522643089294, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "loss": 0.4121594727039337, + "step": 9020 + }, + { + "ce_loss": 0.10628797113895416, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "distill_loss": 0.0858006551861763, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "ref_ce_loss": 0.14815564453601837, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "loss": 0.4245898723602295, + "step": 9020 + }, + { + "ce_loss": 0.10541052371263504, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "distill_loss": 0.15007489919662476, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "ref_ce_loss": 0.1281569004058838, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "loss": 0.394951194524765, + "step": 9020 + }, + { + "ce_loss": 0.09292899072170258, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "distill_loss": 0.1514538824558258, + "epoch": 3.008672448298866, + "step": 9020 + }, + { + "epoch": 3.008672448298866, + "ref_ce_loss": 0.11690583825111389, + "step": 9020 + }, + { + "epoch": 3.012008005336891, + "loss": 0.4817, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "grad_norm": 3.1654815673828125, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "learning_rate": 0.00021505648800853263, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "loss": 0.32135191559791565, + "step": 9030 + }, + { + "ce_loss": 0.10678337514400482, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "distill_loss": 0.0853194147348404, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "ref_ce_loss": 0.066754050552845, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "loss": 0.4086533188819885, + "step": 9030 + }, + { + "ce_loss": 0.08803881704807281, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "distill_loss": 0.1329783946275711, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "ref_ce_loss": 0.12893329560756683, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "loss": 0.6685104370117188, + "step": 9030 + }, + { + "ce_loss": 0.15491558611392975, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "distill_loss": 0.28164738416671753, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "ref_ce_loss": 0.11756544560194016, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "loss": 0.555343508720398, + "step": 9030 + }, + { + "ce_loss": 0.16929669678211212, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "distill_loss": 0.13908150792121887, + "epoch": 3.012008005336891, + "step": 9030 + }, + { + "epoch": 3.012008005336891, + "ref_ce_loss": 0.1195400282740593, + "step": 9030 + }, + { + "epoch": 3.0153435623749165, + "loss": 0.5041, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "grad_norm": 1.995711088180542, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "learning_rate": 0.00021487391025638172, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "loss": 0.3834221661090851, + "step": 9040 + }, + { + "ce_loss": 0.17786496877670288, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "distill_loss": 0.11993467062711716, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "ref_ce_loss": 0.08478295803070068, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "loss": 0.4656774699687958, + "step": 9040 + }, + { + "ce_loss": 0.1686105728149414, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "distill_loss": 0.15664944052696228, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "ref_ce_loss": 0.09069463610649109, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "loss": 0.28203755617141724, + "step": 9040 + }, + { + "ce_loss": 0.054809946566820145, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "distill_loss": 0.1106935366988182, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "ref_ce_loss": 0.07380043715238571, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "loss": 0.6601328253746033, + "step": 9040 + }, + { + "ce_loss": 0.18228961527347565, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "distill_loss": 0.14953771233558655, + "epoch": 3.0153435623749165, + "step": 9040 + }, + { + "epoch": 3.0153435623749165, + "ref_ce_loss": 0.11164674162864685, + "step": 9040 + }, + { + "epoch": 3.018679119412942, + "loss": 0.4816, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "grad_norm": 3.625382423400879, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "learning_rate": 0.00021469121419980916, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "loss": 0.4787139296531677, + "step": 9050 + }, + { + "ce_loss": 0.20587719976902008, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "distill_loss": 0.16615983843803406, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "ref_ce_loss": 0.10627803951501846, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "loss": 0.5185806751251221, + "step": 9050 + }, + { + "ce_loss": 0.11246607452630997, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "distill_loss": 0.1460881382226944, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "ref_ce_loss": 0.08953309059143066, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "loss": 0.986879289150238, + "step": 9050 + }, + { + "ce_loss": 0.14453230798244476, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "distill_loss": 0.1707886904478073, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "ref_ce_loss": 0.12012699991464615, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "loss": 0.46279260516166687, + "step": 9050 + }, + { + "ce_loss": 0.14909015595912933, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "distill_loss": 0.17477688193321228, + "epoch": 3.018679119412942, + "step": 9050 + }, + { + "epoch": 3.018679119412942, + "ref_ce_loss": 0.10666271299123764, + "step": 9050 + }, + { + "epoch": 3.022014676450967, + "loss": 0.5947, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "grad_norm": 3.0756499767303467, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "learning_rate": 0.00021450840017198049, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "loss": 0.5729109644889832, + "step": 9060 + }, + { + "ce_loss": 0.23215997219085693, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "distill_loss": 0.18815375864505768, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "ref_ce_loss": 0.11127826571464539, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "loss": 0.7342166900634766, + "step": 9060 + }, + { + "ce_loss": 0.12690752744674683, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "distill_loss": 0.18578952550888062, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "ref_ce_loss": 0.07483983039855957, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "loss": 0.7186385989189148, + "step": 9060 + }, + { + "ce_loss": 0.19234232604503632, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "distill_loss": 0.2232697308063507, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "ref_ce_loss": 0.11886679381132126, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "loss": 0.5786302089691162, + "step": 9060 + }, + { + "ce_loss": 0.1808573454618454, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "distill_loss": 0.14951898157596588, + "epoch": 3.022014676450967, + "step": 9060 + }, + { + "epoch": 3.022014676450967, + "ref_ce_loss": 0.14975999295711517, + "step": 9060 + }, + { + "epoch": 3.0253502334889926, + "loss": 0.5207, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "grad_norm": 2.4221537113189697, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "learning_rate": 0.0002143254685062764, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "loss": 0.462746798992157, + "step": 9070 + }, + { + "ce_loss": 0.08939208835363388, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "distill_loss": 0.10813222080469131, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "ref_ce_loss": 0.1629054695367813, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "loss": 0.36405521631240845, + "step": 9070 + }, + { + "ce_loss": 0.11861259490251541, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "distill_loss": 0.08977380394935608, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "ref_ce_loss": 0.10912643373012543, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "loss": 0.4217601716518402, + "step": 9070 + }, + { + "ce_loss": 0.1625213623046875, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "distill_loss": 0.1497325301170349, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "ref_ce_loss": 0.10908317565917969, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "loss": 0.36443212628364563, + "step": 9070 + }, + { + "ce_loss": 0.10747730731964111, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "distill_loss": 0.10043259710073471, + "epoch": 3.0253502334889926, + "step": 9070 + }, + { + "epoch": 3.0253502334889926, + "ref_ce_loss": 0.1053435280919075, + "step": 9070 + }, + { + "epoch": 3.028685790527018, + "loss": 0.4974, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "grad_norm": 2.5194292068481445, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "learning_rate": 0.0002141424195362921, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "loss": 0.38930782675743103, + "step": 9080 + }, + { + "ce_loss": 0.11574207991361618, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "distill_loss": 0.14219531416893005, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "ref_ce_loss": 0.09605050086975098, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "loss": 0.45022615790367126, + "step": 9080 + }, + { + "ce_loss": 0.09811569005250931, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "distill_loss": 0.1672498881816864, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "ref_ce_loss": 0.13437940180301666, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "loss": 0.5666844844818115, + "step": 9080 + }, + { + "ce_loss": 0.1607423722743988, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "distill_loss": 0.1925409585237503, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "ref_ce_loss": 0.15472768247127533, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "loss": 0.4131091833114624, + "step": 9080 + }, + { + "ce_loss": 0.11376647651195526, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "distill_loss": 0.1661110818386078, + "epoch": 3.028685790527018, + "step": 9080 + }, + { + "epoch": 3.028685790527018, + "ref_ce_loss": 0.06726560741662979, + "step": 9080 + }, + { + "epoch": 3.0320213475650433, + "loss": 0.5143, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "grad_norm": 2.647369146347046, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "learning_rate": 0.00021395925359583666, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "loss": 0.6554050445556641, + "step": 9090 + }, + { + "ce_loss": 0.07042604684829712, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "distill_loss": 0.08944550156593323, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "ref_ce_loss": 0.08455150574445724, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "loss": 0.40479806065559387, + "step": 9090 + }, + { + "ce_loss": 0.1453285664319992, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "distill_loss": 0.11391732096672058, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "ref_ce_loss": 0.11537247151136398, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "loss": 0.42904889583587646, + "step": 9090 + }, + { + "ce_loss": 0.09523611515760422, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "distill_loss": 0.080131895840168, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "ref_ce_loss": 0.07932931184768677, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "loss": 0.6467583179473877, + "step": 9090 + }, + { + "ce_loss": 0.12051200866699219, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "distill_loss": 0.13366849720478058, + "epoch": 3.0320213475650433, + "step": 9090 + }, + { + "epoch": 3.0320213475650433, + "ref_ce_loss": 0.1472989320755005, + "step": 9090 + }, + { + "epoch": 3.0353569046030686, + "loss": 0.4952, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "grad_norm": 2.8011693954467773, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "learning_rate": 0.00021377597101893256, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "loss": 0.42988431453704834, + "step": 9100 + }, + { + "ce_loss": 0.17967963218688965, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "distill_loss": 0.11916738003492355, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "ref_ce_loss": 0.10457290709018707, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "loss": 0.47870928049087524, + "step": 9100 + }, + { + "ce_loss": 0.12809933722019196, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "distill_loss": 0.0908149853348732, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "ref_ce_loss": 0.07106465846300125, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "loss": 0.49013468623161316, + "step": 9100 + }, + { + "ce_loss": 0.09937848895788193, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "distill_loss": 0.0903325080871582, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "ref_ce_loss": 0.1495119333267212, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "loss": 0.49605005979537964, + "step": 9100 + }, + { + "ce_loss": 0.11459803581237793, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "distill_loss": 0.09380602091550827, + "epoch": 3.0353569046030686, + "step": 9100 + }, + { + "epoch": 3.0353569046030686, + "ref_ce_loss": 0.06981277465820312, + "step": 9100 + }, + { + "epoch": 3.038692461641094, + "loss": 0.501, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "grad_norm": 3.9146056175231934, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "learning_rate": 0.00021359257213981485, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "loss": 0.9335725903511047, + "step": 9110 + }, + { + "ce_loss": 0.21147924661636353, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "distill_loss": 0.17345088720321655, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "ref_ce_loss": 0.10016033053398132, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "loss": 0.5776867866516113, + "step": 9110 + }, + { + "ce_loss": 0.24646618962287903, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "distill_loss": 0.1633155643939972, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "ref_ce_loss": 0.1673925817012787, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "loss": 0.5697858333587646, + "step": 9110 + }, + { + "ce_loss": 0.10325392335653305, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "distill_loss": 0.26549357175827026, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "ref_ce_loss": 0.13094070553779602, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "loss": 0.44562843441963196, + "step": 9110 + }, + { + "ce_loss": 0.11659137159585953, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "distill_loss": 0.2210933119058609, + "epoch": 3.038692461641094, + "step": 9110 + }, + { + "epoch": 3.038692461641094, + "ref_ce_loss": 0.10697054117918015, + "step": 9110 + }, + { + "epoch": 3.0420280186791193, + "loss": 0.5368, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "grad_norm": 3.2923338413238525, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "learning_rate": 0.00021340905729293078, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "loss": 0.7003005743026733, + "step": 9120 + }, + { + "ce_loss": 0.2035401165485382, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "distill_loss": 0.12703534960746765, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "ref_ce_loss": 0.1929311752319336, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "loss": 0.32534945011138916, + "step": 9120 + }, + { + "ce_loss": 0.09824029356241226, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "distill_loss": 0.0958671048283577, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "ref_ce_loss": 0.0892966240644455, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "loss": 0.38903501629829407, + "step": 9120 + }, + { + "ce_loss": 0.15053687989711761, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "distill_loss": 0.11209321022033691, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "ref_ce_loss": 0.07555312663316727, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "loss": 0.3890794515609741, + "step": 9120 + }, + { + "ce_loss": 0.07411421835422516, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "distill_loss": 0.11061069369316101, + "epoch": 3.0420280186791193, + "step": 9120 + }, + { + "epoch": 3.0420280186791193, + "ref_ce_loss": 0.09935502707958221, + "step": 9120 + }, + { + "epoch": 3.0453635757171447, + "loss": 0.4967, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "grad_norm": 2.6918272972106934, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "learning_rate": 0.00021322542681293904, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "loss": 0.7495424747467041, + "step": 9130 + }, + { + "ce_loss": 0.14578987658023834, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "distill_loss": 0.09604738652706146, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "ref_ce_loss": 0.07308385521173477, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "loss": 0.7880163788795471, + "step": 9130 + }, + { + "ce_loss": 0.12381213158369064, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "distill_loss": 0.1534123718738556, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "ref_ce_loss": 0.11481275409460068, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "loss": 0.3943750262260437, + "step": 9130 + }, + { + "ce_loss": 0.12776845693588257, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "distill_loss": 0.15161047875881195, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "ref_ce_loss": 0.06874366849660873, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "loss": 0.5388900637626648, + "step": 9130 + }, + { + "ce_loss": 0.22864705324172974, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "distill_loss": 0.1401529610157013, + "epoch": 3.0453635757171447, + "step": 9130 + }, + { + "epoch": 3.0453635757171447, + "ref_ce_loss": 0.13798050582408905, + "step": 9130 + }, + { + "epoch": 3.04869913275517, + "loss": 0.4863, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "grad_norm": 2.592088460922241, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "learning_rate": 0.0002130416810347092, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "loss": 0.5827805995941162, + "step": 9140 + }, + { + "ce_loss": 0.1532483845949173, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "distill_loss": 0.1079009547829628, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "ref_ce_loss": 0.09535907208919525, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "loss": 0.45961111783981323, + "step": 9140 + }, + { + "ce_loss": 0.153269961476326, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "distill_loss": 0.1302245557308197, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "ref_ce_loss": 0.10670872777700424, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "loss": 0.5426515340805054, + "step": 9140 + }, + { + "ce_loss": 0.15486785769462585, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "distill_loss": 0.11330495774745941, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "ref_ce_loss": 0.08581661432981491, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "loss": 0.3043895959854126, + "step": 9140 + }, + { + "ce_loss": 0.0612226277589798, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "distill_loss": 0.11302270740270615, + "epoch": 3.04869913275517, + "step": 9140 + }, + { + "epoch": 3.04869913275517, + "ref_ce_loss": 0.1041092649102211, + "step": 9140 + }, + { + "epoch": 3.0520346897931954, + "loss": 0.4493, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "grad_norm": 2.173112630844116, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "learning_rate": 0.00021285782029332111, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "loss": 0.3466378450393677, + "step": 9150 + }, + { + "ce_loss": 0.1040438711643219, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "distill_loss": 0.10754573345184326, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "ref_ce_loss": 0.10663019120693207, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "loss": 0.7064090967178345, + "step": 9150 + }, + { + "ce_loss": 0.140250563621521, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "distill_loss": 0.10913405567407608, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "ref_ce_loss": 0.1309662163257599, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "loss": 0.5606474280357361, + "step": 9150 + }, + { + "ce_loss": 0.18404537439346313, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "distill_loss": 0.1043291687965393, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "ref_ce_loss": 0.11552339047193527, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "loss": 0.23970846831798553, + "step": 9150 + }, + { + "ce_loss": 0.03243853524327278, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "distill_loss": 0.10297881066799164, + "epoch": 3.0520346897931954, + "step": 9150 + }, + { + "epoch": 3.0520346897931954, + "ref_ce_loss": 0.060380127280950546, + "step": 9150 + }, + { + "epoch": 3.0553702468312207, + "loss": 0.5151, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "grad_norm": 2.2912018299102783, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "learning_rate": 0.00021267384492406415, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "loss": 0.3046942949295044, + "step": 9160 + }, + { + "ce_loss": 0.04315432533621788, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "distill_loss": 0.0770706906914711, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "ref_ce_loss": 0.07259001582860947, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "loss": 0.37191644310951233, + "step": 9160 + }, + { + "ce_loss": 0.14015623927116394, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "distill_loss": 0.10944913327693939, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "ref_ce_loss": 0.09285426884889603, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "loss": 0.8361793756484985, + "step": 9160 + }, + { + "ce_loss": 0.15691381692886353, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "distill_loss": 0.12314335256814957, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "ref_ce_loss": 0.09802524745464325, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "loss": 0.5051029324531555, + "step": 9160 + }, + { + "ce_loss": 0.16032618284225464, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "distill_loss": 0.1252148151397705, + "epoch": 3.0553702468312207, + "step": 9160 + }, + { + "epoch": 3.0553702468312207, + "ref_ce_loss": 0.09847740828990936, + "step": 9160 + }, + { + "epoch": 3.058705803869246, + "loss": 0.4546, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "grad_norm": 2.534843921661377, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "learning_rate": 0.00021248975526243682, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "loss": 0.29830026626586914, + "step": 9170 + }, + { + "ce_loss": 0.0924723893404007, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "distill_loss": 0.08625347167253494, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "ref_ce_loss": 0.09278205037117004, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "loss": 0.3502795398235321, + "step": 9170 + }, + { + "ce_loss": 0.06436974555253983, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "distill_loss": 0.09440024197101593, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "ref_ce_loss": 0.0780462846159935, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "loss": 0.29896295070648193, + "step": 9170 + }, + { + "ce_loss": 0.0671202540397644, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "distill_loss": 0.0891413614153862, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "ref_ce_loss": 0.09291604161262512, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "loss": 0.5191394090652466, + "step": 9170 + }, + { + "ce_loss": 0.15048867464065552, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "distill_loss": 0.10521610081195831, + "epoch": 3.058705803869246, + "step": 9170 + }, + { + "epoch": 3.058705803869246, + "ref_ce_loss": 0.09934370219707489, + "step": 9170 + }, + { + "epoch": 3.0620413609072714, + "loss": 0.4654, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "grad_norm": 2.8238677978515625, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "learning_rate": 0.00021230555164414614, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "loss": 0.5475714206695557, + "step": 9180 + }, + { + "ce_loss": 0.1436816155910492, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "distill_loss": 0.11422336101531982, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "ref_ce_loss": 0.1417398303747177, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "loss": 0.528938889503479, + "step": 9180 + }, + { + "ce_loss": 0.12694130837917328, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "distill_loss": 0.13421253859996796, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "ref_ce_loss": 0.11657682061195374, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "loss": 0.7762036323547363, + "step": 9180 + }, + { + "ce_loss": 0.1628248542547226, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "distill_loss": 0.10600240528583527, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "ref_ce_loss": 0.1570037305355072, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "loss": 0.35722458362579346, + "step": 9180 + }, + { + "ce_loss": 0.09392691403627396, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "distill_loss": 0.08912178874015808, + "epoch": 3.0620413609072714, + "step": 9180 + }, + { + "epoch": 3.0620413609072714, + "ref_ce_loss": 0.10203284025192261, + "step": 9180 + }, + { + "epoch": 3.0653769179452968, + "loss": 0.4791, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "grad_norm": 2.509390354156494, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "learning_rate": 0.00021212123440510683, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "loss": 0.4448990225791931, + "step": 9190 + }, + { + "ce_loss": 0.15895746648311615, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "distill_loss": 0.13571859896183014, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "ref_ce_loss": 0.10972491651773453, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "loss": 0.4973427653312683, + "step": 9190 + }, + { + "ce_loss": 0.09390971809625626, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "distill_loss": 0.1523730307817459, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "ref_ce_loss": 0.09280847012996674, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "loss": 0.30266839265823364, + "step": 9190 + }, + { + "ce_loss": 0.08505178987979889, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "distill_loss": 0.09608002007007599, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "ref_ce_loss": 0.0848335325717926, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "loss": 0.5422333478927612, + "step": 9190 + }, + { + "ce_loss": 0.2144017219543457, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "distill_loss": 0.16743631660938263, + "epoch": 3.0653769179452968, + "step": 9190 + }, + { + "epoch": 3.0653769179452968, + "ref_ce_loss": 0.07784318923950195, + "step": 9190 + }, + { + "epoch": 3.068712474983322, + "loss": 0.5275, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "grad_norm": 2.0295801162719727, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "learning_rate": 0.00021193680388144074, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "loss": 0.43971535563468933, + "step": 9200 + }, + { + "ce_loss": 0.10325153172016144, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "distill_loss": 0.12678496539592743, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "ref_ce_loss": 0.14589820802211761, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "loss": 0.38762301206588745, + "step": 9200 + }, + { + "ce_loss": 0.08085848391056061, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "distill_loss": 0.0924757868051529, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "ref_ce_loss": 0.11108756065368652, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "loss": 0.3827369213104248, + "step": 9200 + }, + { + "ce_loss": 0.06147749722003937, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "distill_loss": 0.15130433440208435, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "ref_ce_loss": 0.10998158156871796, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "loss": 0.36490562558174133, + "step": 9200 + }, + { + "ce_loss": 0.1082155779004097, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "distill_loss": 0.12540577352046967, + "epoch": 3.068712474983322, + "step": 9200 + }, + { + "epoch": 3.068712474983322, + "ref_ce_loss": 0.08237636089324951, + "step": 9200 + }, + { + "epoch": 3.0720480320213475, + "loss": 0.4654, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "grad_norm": 3.6549220085144043, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "learning_rate": 0.00021175226040947643, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "loss": 0.3767533004283905, + "step": 9210 + }, + { + "ce_loss": 0.030972249805927277, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "distill_loss": 0.10726246237754822, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "ref_ce_loss": 0.06324587017297745, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "loss": 0.35787636041641235, + "step": 9210 + }, + { + "ce_loss": 0.10263849794864655, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "distill_loss": 0.1269713193178177, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "ref_ce_loss": 0.09760292619466782, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "loss": 0.7159144878387451, + "step": 9210 + }, + { + "ce_loss": 0.19792456924915314, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "distill_loss": 0.1999223232269287, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "ref_ce_loss": 0.09326591342687607, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "loss": 0.45586901903152466, + "step": 9210 + }, + { + "ce_loss": 0.11067557334899902, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "distill_loss": 0.1134071871638298, + "epoch": 3.0720480320213475, + "step": 9210 + }, + { + "epoch": 3.0720480320213475, + "ref_ce_loss": 0.10379556566476822, + "step": 9210 + }, + { + "epoch": 3.075383589059373, + "loss": 0.4981, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "grad_norm": 3.2063794136047363, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "learning_rate": 0.00021156760432574845, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "loss": 0.6135134696960449, + "step": 9220 + }, + { + "ce_loss": 0.18632878363132477, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "distill_loss": 0.1343146711587906, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "ref_ce_loss": 0.12228333950042725, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "loss": 0.7215794324874878, + "step": 9220 + }, + { + "ce_loss": 0.17847248911857605, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "distill_loss": 0.14979347586631775, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "ref_ce_loss": 0.13431112468242645, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "loss": 0.5225598812103271, + "step": 9220 + }, + { + "ce_loss": 0.14933809638023376, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "distill_loss": 0.14387843012809753, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "ref_ce_loss": 0.09537240117788315, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "loss": 0.648240327835083, + "step": 9220 + }, + { + "ce_loss": 0.0898180678486824, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "distill_loss": 0.12337978929281235, + "epoch": 3.075383589059373, + "step": 9220 + }, + { + "epoch": 3.075383589059373, + "ref_ce_loss": 0.11949852854013443, + "step": 9220 + }, + { + "epoch": 3.078719146097398, + "loss": 0.5397, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "grad_norm": 4.953123569488525, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "learning_rate": 0.00021138283596699658, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "loss": 1.1860730648040771, + "step": 9230 + }, + { + "ce_loss": 0.2039407640695572, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "distill_loss": 0.11151131242513657, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "ref_ce_loss": 0.17183974385261536, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "loss": 0.3743239939212799, + "step": 9230 + }, + { + "ce_loss": 0.08552516996860504, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "distill_loss": 0.09078429639339447, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "ref_ce_loss": 0.0932949110865593, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "loss": 0.3618348240852356, + "step": 9230 + }, + { + "ce_loss": 0.09893414378166199, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "distill_loss": 0.101051926612854, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "ref_ce_loss": 0.1281239241361618, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "loss": 0.45348069071769714, + "step": 9230 + }, + { + "ce_loss": 0.15692560374736786, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "distill_loss": 0.1507617086172104, + "epoch": 3.078719146097398, + "step": 9230 + }, + { + "epoch": 3.078719146097398, + "ref_ce_loss": 0.0963934138417244, + "step": 9230 + }, + { + "epoch": 3.0820547031354235, + "loss": 0.5099, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "grad_norm": 6.172658920288086, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "learning_rate": 0.00021119795567016553, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "loss": 0.77073734998703, + "step": 9240 + }, + { + "ce_loss": 0.19863741099834442, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "distill_loss": 0.13241790235042572, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "ref_ce_loss": 0.10125939548015594, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "loss": 0.6127821207046509, + "step": 9240 + }, + { + "ce_loss": 0.19738160073757172, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "distill_loss": 0.13014522194862366, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "ref_ce_loss": 0.15752890706062317, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "loss": 0.5437523722648621, + "step": 9240 + }, + { + "ce_loss": 0.17684681713581085, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "distill_loss": 0.09984688460826874, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "ref_ce_loss": 0.08175192773342133, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "loss": 0.7698802351951599, + "step": 9240 + }, + { + "ce_loss": 0.10929053276777267, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "distill_loss": 0.0947088897228241, + "epoch": 3.0820547031354235, + "step": 9240 + }, + { + "epoch": 3.0820547031354235, + "ref_ce_loss": 0.10278258472681046, + "step": 9240 + }, + { + "epoch": 3.085390260173449, + "loss": 0.5153, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "grad_norm": 4.075594902038574, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "learning_rate": 0.00021101296377240388, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "loss": 0.3964616656303406, + "step": 9250 + }, + { + "ce_loss": 0.1403404325246811, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "distill_loss": 0.10999090224504471, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "ref_ce_loss": 0.09092675149440765, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "loss": 0.31418895721435547, + "step": 9250 + }, + { + "ce_loss": 0.12451020628213882, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "distill_loss": 0.104770727455616, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "ref_ce_loss": 0.06058081239461899, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "loss": 0.612764835357666, + "step": 9250 + }, + { + "ce_loss": 0.16425681114196777, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "distill_loss": 0.14076608419418335, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "ref_ce_loss": 0.14075103402137756, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "loss": 0.45888441801071167, + "step": 9250 + }, + { + "ce_loss": 0.16245479881763458, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "distill_loss": 0.13823184370994568, + "epoch": 3.085390260173449, + "step": 9250 + }, + { + "epoch": 3.085390260173449, + "ref_ce_loss": 0.11153128743171692, + "step": 9250 + }, + { + "epoch": 3.088725817211474, + "loss": 0.5267, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "grad_norm": 2.336992025375366, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "learning_rate": 0.00021082786061106401, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "loss": 0.24876004457473755, + "step": 9260 + }, + { + "ce_loss": 0.08521168678998947, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "distill_loss": 0.07871997356414795, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "ref_ce_loss": 0.08447657525539398, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "loss": 1.1503713130950928, + "step": 9260 + }, + { + "ce_loss": 0.17891433835029602, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "distill_loss": 0.12399543821811676, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "ref_ce_loss": 0.11212249100208282, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "loss": 0.5678211450576782, + "step": 9260 + }, + { + "ce_loss": 0.164375901222229, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "distill_loss": 0.12175918370485306, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "ref_ce_loss": 0.1290905773639679, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "loss": 0.5536719560623169, + "step": 9260 + }, + { + "ce_loss": 0.13819049298763275, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "distill_loss": 0.0980941578745842, + "epoch": 3.088725817211474, + "step": 9260 + }, + { + "epoch": 3.088725817211474, + "ref_ce_loss": 0.09547659754753113, + "step": 9260 + }, + { + "epoch": 3.0920613742494996, + "loss": 0.5244, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "grad_norm": 1.5601093769073486, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "learning_rate": 0.000210642646523701, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "loss": 0.4849652647972107, + "step": 9270 + }, + { + "ce_loss": 0.09189176559448242, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "distill_loss": 0.11284545809030533, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "ref_ce_loss": 0.12299380451440811, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "loss": 0.44110724329948425, + "step": 9270 + }, + { + "ce_loss": 0.15729226171970367, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "distill_loss": 0.11161144822835922, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "ref_ce_loss": 0.1129552498459816, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "loss": 0.38249096274375916, + "step": 9270 + }, + { + "ce_loss": 0.08966681361198425, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "distill_loss": 0.11741229146718979, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "ref_ce_loss": 0.07501877099275589, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "loss": 0.6309377551078796, + "step": 9270 + }, + { + "ce_loss": 0.1539163440465927, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "distill_loss": 0.1314060539007187, + "epoch": 3.0920613742494996, + "step": 9270 + }, + { + "epoch": 3.0920613742494996, + "ref_ce_loss": 0.12064765393733978, + "step": 9270 + }, + { + "epoch": 3.095396931287525, + "loss": 0.5212, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "grad_norm": 2.5884647369384766, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "learning_rate": 0.0002104573218480723, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "loss": 0.47474294900894165, + "step": 9280 + }, + { + "ce_loss": 0.15236707031726837, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "distill_loss": 0.07646772265434265, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "ref_ce_loss": 0.09920775145292282, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "loss": 0.4098622500896454, + "step": 9280 + }, + { + "ce_loss": 0.17156140506267548, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "distill_loss": 0.1252693384885788, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "ref_ce_loss": 0.11265676468610764, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "loss": 0.27114593982696533, + "step": 9280 + }, + { + "ce_loss": 0.10317492485046387, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "distill_loss": 0.08390937000513077, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "ref_ce_loss": 0.08384941518306732, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "loss": 0.4030131995677948, + "step": 9280 + }, + { + "ce_loss": 0.1477421671152115, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "distill_loss": 0.11889496445655823, + "epoch": 3.095396931287525, + "step": 9280 + }, + { + "epoch": 3.095396931287525, + "ref_ce_loss": 0.08689527958631516, + "step": 9280 + }, + { + "epoch": 3.0987324883255503, + "loss": 0.4677, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "grad_norm": 2.5747299194335938, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "learning_rate": 0.00021027188692213702, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "loss": 0.44657784700393677, + "step": 9290 + }, + { + "ce_loss": 0.06276731193065643, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "distill_loss": 0.10876451432704926, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "ref_ce_loss": 0.12008004635572433, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "loss": 0.4799278676509857, + "step": 9290 + }, + { + "ce_loss": 0.12068340927362442, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "distill_loss": 0.13862359523773193, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "ref_ce_loss": 0.09833888709545135, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "loss": 0.4970934987068176, + "step": 9290 + }, + { + "ce_loss": 0.20263740420341492, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "distill_loss": 0.14431583881378174, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "ref_ce_loss": 0.11357313394546509, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "loss": 0.5054774284362793, + "step": 9290 + }, + { + "ce_loss": 0.18168766796588898, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "distill_loss": 0.1163872703909874, + "epoch": 3.0987324883255503, + "step": 9290 + }, + { + "epoch": 3.0987324883255503, + "ref_ce_loss": 0.12051656097173691, + "step": 9290 + }, + { + "epoch": 3.1020680453635756, + "loss": 0.4978, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "grad_norm": 3.1117746829986572, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "learning_rate": 0.00021008634208405532, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "loss": 0.6433578133583069, + "step": 9300 + }, + { + "ce_loss": 0.17003703117370605, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "distill_loss": 0.18623663485050201, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "ref_ce_loss": 0.13352155685424805, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "loss": 0.7382215261459351, + "step": 9300 + }, + { + "ce_loss": 0.12075452506542206, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "distill_loss": 0.11009524762630463, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "ref_ce_loss": 0.08462850004434586, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "loss": 1.0106620788574219, + "step": 9300 + }, + { + "ce_loss": 0.19730034470558167, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "distill_loss": 0.19440525770187378, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "ref_ce_loss": 0.14143586158752441, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "loss": 0.2682120203971863, + "step": 9300 + }, + { + "ce_loss": 0.03231796994805336, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "distill_loss": 0.09826377779245377, + "epoch": 3.1020680453635756, + "step": 9300 + }, + { + "epoch": 3.1020680453635756, + "ref_ce_loss": 0.0769156664609909, + "step": 9300 + }, + { + "epoch": 3.105403602401601, + "loss": 0.4805, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "grad_norm": 2.6101269721984863, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "learning_rate": 0.00020990068767218778, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "loss": 0.6195972561836243, + "step": 9310 + }, + { + "ce_loss": 0.24588622152805328, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "distill_loss": 0.16623997688293457, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "ref_ce_loss": 0.144457146525383, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "loss": 0.3230529725551605, + "step": 9310 + }, + { + "ce_loss": 0.05365613102912903, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "distill_loss": 0.11842749267816544, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "ref_ce_loss": 0.09812147170305252, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "loss": 0.47844380140304565, + "step": 9310 + }, + { + "ce_loss": 0.20352646708488464, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "distill_loss": 0.16697512567043304, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "ref_ce_loss": 0.0778585821390152, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "loss": 0.5424474477767944, + "step": 9310 + }, + { + "ce_loss": 0.176710307598114, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "distill_loss": 0.11072219163179398, + "epoch": 3.105403602401601, + "step": 9310 + }, + { + "epoch": 3.105403602401601, + "ref_ce_loss": 0.11033220589160919, + "step": 9310 + }, + { + "epoch": 3.1087391594396263, + "loss": 0.5438, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "grad_norm": 2.295335292816162, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "learning_rate": 0.00020971492402509483, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "loss": 0.8378156423568726, + "step": 9320 + }, + { + "ce_loss": 0.16845764219760895, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "distill_loss": 0.16372105479240417, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "ref_ce_loss": 0.12829598784446716, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "loss": 0.463933527469635, + "step": 9320 + }, + { + "ce_loss": 0.1278490573167801, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "distill_loss": 0.12349837273359299, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "ref_ce_loss": 0.08968111127614975, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "loss": 0.4199450612068176, + "step": 9320 + }, + { + "ce_loss": 0.11417478322982788, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "distill_loss": 0.13870379328727722, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "ref_ce_loss": 0.11972949653863907, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "loss": 0.5914303064346313, + "step": 9320 + }, + { + "ce_loss": 0.14238835871219635, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "distill_loss": 0.12284432351589203, + "epoch": 3.1087391594396263, + "step": 9320 + }, + { + "epoch": 3.1087391594396263, + "ref_ce_loss": 0.07381831109523773, + "step": 9320 + }, + { + "epoch": 3.1120747164776517, + "loss": 0.547, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "grad_norm": 2.3080222606658936, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "learning_rate": 0.00020952905148153607, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "loss": 0.8455989360809326, + "step": 9330 + }, + { + "ce_loss": 0.17443938553333282, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "distill_loss": 0.10574007034301758, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "ref_ce_loss": 0.07656016945838928, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "loss": 0.6197810173034668, + "step": 9330 + }, + { + "ce_loss": 0.15716618299484253, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "distill_loss": 0.17074379324913025, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "ref_ce_loss": 0.16151253879070282, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "loss": 0.4124746322631836, + "step": 9330 + }, + { + "ce_loss": 0.1550775021314621, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "distill_loss": 0.12570025026798248, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "ref_ce_loss": 0.07098022103309631, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "loss": 0.8346481323242188, + "step": 9330 + }, + { + "ce_loss": 0.18216295540332794, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "distill_loss": 0.10690856724977493, + "epoch": 3.1120747164776517, + "step": 9330 + }, + { + "epoch": 3.1120747164776517, + "ref_ce_loss": 0.13296154141426086, + "step": 9330 + }, + { + "epoch": 3.115410273515677, + "loss": 0.4797, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "grad_norm": 2.3242478370666504, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "learning_rate": 0.00020934307038046965, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "loss": 0.3541427552700043, + "step": 9340 + }, + { + "ce_loss": 0.1066516563296318, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "distill_loss": 0.1119885966181755, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "ref_ce_loss": 0.08281800895929337, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "loss": 0.788700520992279, + "step": 9340 + }, + { + "ce_loss": 0.2846371829509735, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "distill_loss": 0.2575102746486664, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "ref_ce_loss": 0.1448318362236023, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "loss": 0.48121488094329834, + "step": 9340 + }, + { + "ce_loss": 0.1438976377248764, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "distill_loss": 0.17746815085411072, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "ref_ce_loss": 0.10531459003686905, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "loss": 0.30875542759895325, + "step": 9340 + }, + { + "ce_loss": 0.09192804992198944, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "distill_loss": 0.10150475800037384, + "epoch": 3.115410273515677, + "step": 9340 + }, + { + "epoch": 3.115410273515677, + "ref_ce_loss": 0.08724801987409592, + "step": 9340 + }, + { + "epoch": 3.1187458305537024, + "loss": 0.4828, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "grad_norm": 3.029604434967041, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "learning_rate": 0.00020915698106105187, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "loss": 0.46674320101737976, + "step": 9350 + }, + { + "ce_loss": 0.15375635027885437, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "distill_loss": 0.20978285372257233, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "ref_ce_loss": 0.10224690288305283, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "loss": 0.5229172706604004, + "step": 9350 + }, + { + "ce_loss": 0.16858038306236267, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "distill_loss": 0.21174737811088562, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "ref_ce_loss": 0.10061193257570267, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "loss": 0.7606104016304016, + "step": 9350 + }, + { + "ce_loss": 0.3095298707485199, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "distill_loss": 0.1703067272901535, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "ref_ce_loss": 0.2030133306980133, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "loss": 0.8234333992004395, + "step": 9350 + }, + { + "ce_loss": 0.09284783899784088, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "distill_loss": 0.19337543845176697, + "epoch": 3.1187458305537024, + "step": 9350 + }, + { + "epoch": 3.1187458305537024, + "ref_ce_loss": 0.07284106314182281, + "step": 9350 + }, + { + "epoch": 3.1220813875917277, + "loss": 0.5192, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "grad_norm": 2.5633559226989746, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "learning_rate": 0.00020897078386263615, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "loss": 0.4884679615497589, + "step": 9360 + }, + { + "ce_loss": 0.12888407707214355, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "distill_loss": 0.17425504326820374, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "ref_ce_loss": 0.1417858600616455, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "loss": 0.6288591623306274, + "step": 9360 + }, + { + "ce_loss": 0.10382901132106781, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "distill_loss": 0.204414501786232, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "ref_ce_loss": 0.12629245221614838, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "loss": 0.5769619941711426, + "step": 9360 + }, + { + "ce_loss": 0.2375664860010147, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "distill_loss": 0.16669347882270813, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "ref_ce_loss": 0.1284281313419342, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "loss": 0.5622938871383667, + "step": 9360 + }, + { + "ce_loss": 0.16985753178596497, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "distill_loss": 0.17187856137752533, + "epoch": 3.1220813875917277, + "step": 9360 + }, + { + "epoch": 3.1220813875917277, + "ref_ce_loss": 0.17148420214653015, + "step": 9360 + }, + { + "epoch": 3.125416944629753, + "loss": 0.4967, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "grad_norm": 2.569465398788452, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "learning_rate": 0.00020878447912477268, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "loss": 0.4813728332519531, + "step": 9370 + }, + { + "ce_loss": 0.2246464639902115, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "distill_loss": 0.167310893535614, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "ref_ce_loss": 0.08932902663946152, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "loss": 0.3730131983757019, + "step": 9370 + }, + { + "ce_loss": 0.09139455109834671, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "distill_loss": 0.1308048963546753, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "ref_ce_loss": 0.10961811244487762, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "loss": 0.3340955078601837, + "step": 9370 + }, + { + "ce_loss": 0.10592299699783325, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "distill_loss": 0.14246045053005219, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "ref_ce_loss": 0.0574227012693882, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "loss": 0.3670573830604553, + "step": 9370 + }, + { + "ce_loss": 0.1103769838809967, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "distill_loss": 0.12523096799850464, + "epoch": 3.125416944629753, + "step": 9370 + }, + { + "epoch": 3.125416944629753, + "ref_ce_loss": 0.1000145673751831, + "step": 9370 + }, + { + "epoch": 3.1287525016677784, + "loss": 0.4907, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "grad_norm": 2.5900824069976807, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "learning_rate": 0.00020859806718720792, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "loss": 0.6317574977874756, + "step": 9380 + }, + { + "ce_loss": 0.22041912376880646, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "distill_loss": 0.1646532118320465, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "ref_ce_loss": 0.14836913347244263, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "loss": 0.3367335796356201, + "step": 9380 + }, + { + "ce_loss": 0.11072410643100739, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "distill_loss": 0.10584492981433868, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "ref_ce_loss": 0.0841221809387207, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "loss": 0.5291550755500793, + "step": 9380 + }, + { + "ce_loss": 0.22186344861984253, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "distill_loss": 0.17727592587471008, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "ref_ce_loss": 0.12976141273975372, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "loss": 0.5323965549468994, + "step": 9380 + }, + { + "ce_loss": 0.1890687644481659, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "distill_loss": 0.16845601797103882, + "epoch": 3.1287525016677784, + "step": 9380 + }, + { + "epoch": 3.1287525016677784, + "ref_ce_loss": 0.12848173081874847, + "step": 9380 + }, + { + "epoch": 3.1320880587058038, + "loss": 0.4738, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "grad_norm": 2.4686079025268555, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "learning_rate": 0.00020841154838988364, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "loss": 0.6145331859588623, + "step": 9390 + }, + { + "ce_loss": 0.15522648394107819, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "distill_loss": 0.11161641031503677, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "ref_ce_loss": 0.09471061080694199, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "loss": 0.35698485374450684, + "step": 9390 + }, + { + "ce_loss": 0.07764547318220139, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "distill_loss": 0.14094972610473633, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "ref_ce_loss": 0.08621007204055786, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "loss": 0.289503812789917, + "step": 9390 + }, + { + "ce_loss": 0.09239063411951065, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "distill_loss": 0.08384876698255539, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "ref_ce_loss": 0.1131184846162796, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "loss": 0.3908340334892273, + "step": 9390 + }, + { + "ce_loss": 0.13656210899353027, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "distill_loss": 0.11637710779905319, + "epoch": 3.1320880587058038, + "step": 9390 + }, + { + "epoch": 3.1320880587058038, + "ref_ce_loss": 0.10149884223937988, + "step": 9390 + }, + { + "epoch": 3.135423615743829, + "loss": 0.5158, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "grad_norm": 5.387189865112305, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "learning_rate": 0.00020822492307293655, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "loss": 0.44152170419692993, + "step": 9400 + }, + { + "ce_loss": 0.0922570526599884, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "distill_loss": 0.0994657576084137, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "ref_ce_loss": 0.11067865043878555, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "loss": 0.5112382173538208, + "step": 9400 + }, + { + "ce_loss": 0.129240944981575, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "distill_loss": 0.13666221499443054, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "ref_ce_loss": 0.11640065908432007, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "loss": 0.5008998513221741, + "step": 9400 + }, + { + "ce_loss": 0.17135149240493774, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "distill_loss": 0.1431116908788681, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "ref_ce_loss": 0.06502916663885117, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "loss": 0.41493064165115356, + "step": 9400 + }, + { + "ce_loss": 0.11807221919298172, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "distill_loss": 0.1267719715833664, + "epoch": 3.135423615743829, + "step": 9400 + }, + { + "epoch": 3.135423615743829, + "ref_ce_loss": 0.07536394149065018, + "step": 9400 + }, + { + "epoch": 3.1387591727818545, + "loss": 0.4922, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "grad_norm": 2.6245408058166504, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "learning_rate": 0.00020803819157669766, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "loss": 0.36940526962280273, + "step": 9410 + }, + { + "ce_loss": 0.11200276762247086, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "distill_loss": 0.10597594082355499, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "ref_ce_loss": 0.11852702498435974, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "loss": 0.3383985459804535, + "step": 9410 + }, + { + "ce_loss": 0.09196282923221588, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "distill_loss": 0.10407501459121704, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "ref_ce_loss": 0.08363886177539825, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "loss": 0.6444665193557739, + "step": 9410 + }, + { + "ce_loss": 0.21152932941913605, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "distill_loss": 0.14611922204494476, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "ref_ce_loss": 0.12397867441177368, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "loss": 0.43682152032852173, + "step": 9410 + }, + { + "ce_loss": 0.09575681388378143, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "distill_loss": 0.11555679142475128, + "epoch": 3.1387591727818545, + "step": 9410 + }, + { + "epoch": 3.1387591727818545, + "ref_ce_loss": 0.11327240616083145, + "step": 9410 + }, + { + "epoch": 3.14209472981988, + "loss": 0.4972, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "grad_norm": 3.332326650619507, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "learning_rate": 0.00020785135424169156, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "loss": 0.5172271132469177, + "step": 9420 + }, + { + "ce_loss": 0.18631412088871002, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "distill_loss": 0.1371542513370514, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "ref_ce_loss": 0.10870325565338135, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "loss": 0.46155813336372375, + "step": 9420 + }, + { + "ce_loss": 0.1175021380186081, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "distill_loss": 0.1451730579137802, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "ref_ce_loss": 0.14567217230796814, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "loss": 0.7952618598937988, + "step": 9420 + }, + { + "ce_loss": 0.2216344028711319, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "distill_loss": 0.17677900195121765, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "ref_ce_loss": 0.15591177344322205, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "loss": 0.6818380355834961, + "step": 9420 + }, + { + "ce_loss": 0.21773341298103333, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "distill_loss": 0.1644251048564911, + "epoch": 3.14209472981988, + "step": 9420 + }, + { + "epoch": 3.14209472981988, + "ref_ce_loss": 0.14234201610088348, + "step": 9420 + }, + { + "epoch": 3.145430286857905, + "loss": 0.5497, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "grad_norm": 2.401385545730591, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "learning_rate": 0.00020766441140863577, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "loss": 0.393453449010849, + "step": 9430 + }, + { + "ce_loss": 0.13155467808246613, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "distill_loss": 0.10931574553251266, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "ref_ce_loss": 0.09655128419399261, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "loss": 0.5850750803947449, + "step": 9430 + }, + { + "ce_loss": 0.26182281970977783, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "distill_loss": 0.11857051402330399, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "ref_ce_loss": 0.14775000512599945, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "loss": 0.3602902293205261, + "step": 9430 + }, + { + "ce_loss": 0.12058570981025696, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "distill_loss": 0.10224181413650513, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "ref_ce_loss": 0.10732877999544144, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "loss": 0.34497690200805664, + "step": 9430 + }, + { + "ce_loss": 0.08291725069284439, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "distill_loss": 0.11977192759513855, + "epoch": 3.145430286857905, + "step": 9430 + }, + { + "epoch": 3.145430286857905, + "ref_ce_loss": 0.10097347944974899, + "step": 9430 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.4861, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "grad_norm": 3.3197946548461914, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "learning_rate": 0.00020747736341844038, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.6189022660255432, + "step": 9440 + }, + { + "ce_loss": 0.26251354813575745, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "distill_loss": 0.22185054421424866, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "ref_ce_loss": 0.09539277106523514, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.388549268245697, + "step": 9440 + }, + { + "ce_loss": 0.09720093756914139, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "distill_loss": 0.10628783702850342, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "ref_ce_loss": 0.05633091554045677, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.26291322708129883, + "step": 9440 + }, + { + "ce_loss": 0.0834595263004303, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "distill_loss": 0.10990596562623978, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "ref_ce_loss": 0.06892497092485428, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "loss": 0.2386210411787033, + "step": 9440 + }, + { + "ce_loss": 0.044991619884967804, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "distill_loss": 0.08849681168794632, + "epoch": 3.1487658438959305, + "step": 9440 + }, + { + "epoch": 3.1487658438959305, + "ref_ce_loss": 0.10291331261396408, + "step": 9440 + }, + { + "epoch": 3.152101400933956, + "loss": 0.5312, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "grad_norm": 2.457263231277466, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "learning_rate": 0.000207290210612207, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "loss": 0.5044496655464172, + "step": 9450 + }, + { + "ce_loss": 0.19127508997917175, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "distill_loss": 0.14769670367240906, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "ref_ce_loss": 0.11158882081508636, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "loss": 0.8176451921463013, + "step": 9450 + }, + { + "ce_loss": 0.18827858567237854, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "distill_loss": 0.16407611966133118, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "ref_ce_loss": 0.14914929866790771, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "loss": 0.43922194838523865, + "step": 9450 + }, + { + "ce_loss": 0.1384783834218979, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "distill_loss": 0.15343838930130005, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "ref_ce_loss": 0.11001858860254288, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "loss": 0.3858475685119629, + "step": 9450 + }, + { + "ce_loss": 0.13132528960704803, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "distill_loss": 0.13699130713939667, + "epoch": 3.152101400933956, + "step": 9450 + }, + { + "epoch": 3.152101400933956, + "ref_ce_loss": 0.08675961196422577, + "step": 9450 + }, + { + "epoch": 3.155436957971981, + "loss": 0.5252, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "grad_norm": 3.537496566772461, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "learning_rate": 0.00020710295333122868, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "loss": 0.42468106746673584, + "step": 9460 + }, + { + "ce_loss": 0.13657568395137787, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "distill_loss": 0.1591964215040207, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "ref_ce_loss": 0.09413352608680725, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "loss": 0.48567768931388855, + "step": 9460 + }, + { + "ce_loss": 0.10176261514425278, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "distill_loss": 0.14666540920734406, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "ref_ce_loss": 0.12171636521816254, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "loss": 0.6425243020057678, + "step": 9460 + }, + { + "ce_loss": 0.1498386412858963, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "distill_loss": 0.2042589783668518, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "ref_ce_loss": 0.1274162232875824, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "loss": 0.7263787984848022, + "step": 9460 + }, + { + "ce_loss": 0.13975995779037476, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "distill_loss": 0.13692700862884521, + "epoch": 3.155436957971981, + "step": 9460 + }, + { + "epoch": 3.155436957971981, + "ref_ce_loss": 0.1329973340034485, + "step": 9460 + }, + { + "epoch": 3.1587725150100066, + "loss": 0.5612, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "grad_norm": 5.573383331298828, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "learning_rate": 0.00020691559191698876, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "loss": 0.3879528343677521, + "step": 9470 + }, + { + "ce_loss": 0.07907040417194366, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "distill_loss": 0.1571669578552246, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "ref_ce_loss": 0.09325411170721054, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "loss": 0.6910441517829895, + "step": 9470 + }, + { + "ce_loss": 0.2567923069000244, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "distill_loss": 0.19595922529697418, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "ref_ce_loss": 0.19764713943004608, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "loss": 0.8553934693336487, + "step": 9470 + }, + { + "ce_loss": 0.1372901350259781, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "distill_loss": 0.1819874346256256, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "ref_ce_loss": 0.1226576417684555, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "loss": 0.8925281763076782, + "step": 9470 + }, + { + "ce_loss": 0.16941511631011963, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "distill_loss": 0.2515280246734619, + "epoch": 3.1587725150100066, + "step": 9470 + }, + { + "epoch": 3.1587725150100066, + "ref_ce_loss": 0.14782162010669708, + "step": 9470 + }, + { + "epoch": 3.162108072048032, + "loss": 0.5487, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "grad_norm": 4.598330497741699, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "learning_rate": 0.00020672812671116052, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "loss": 0.3818383812904358, + "step": 9480 + }, + { + "ce_loss": 0.08812081813812256, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "distill_loss": 0.1844634860754013, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "ref_ce_loss": 0.10911556333303452, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "loss": 0.4501780569553375, + "step": 9480 + }, + { + "ce_loss": 0.11660466343164444, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "distill_loss": 0.15039286017417908, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "ref_ce_loss": 0.10421738773584366, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "loss": 0.5638201832771301, + "step": 9480 + }, + { + "ce_loss": 0.1827458292245865, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "distill_loss": 0.19522276520729065, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "ref_ce_loss": 0.11500189453363419, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "loss": 0.48389768600463867, + "step": 9480 + }, + { + "ce_loss": 0.1433190554380417, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "distill_loss": 0.21797630190849304, + "epoch": 3.162108072048032, + "step": 9480 + }, + { + "epoch": 3.162108072048032, + "ref_ce_loss": 0.12215530872344971, + "step": 9480 + }, + { + "epoch": 3.1654436290860573, + "loss": 0.6044, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "grad_norm": 3.7871646881103516, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "learning_rate": 0.00020654055805560662, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "loss": 0.5622644424438477, + "step": 9490 + }, + { + "ce_loss": 0.15414637327194214, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "distill_loss": 0.2756098508834839, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "ref_ce_loss": 0.10112477093935013, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "loss": 0.37916234135627747, + "step": 9490 + }, + { + "ce_loss": 0.07440236955881119, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "distill_loss": 0.17885206639766693, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "ref_ce_loss": 0.06276611983776093, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "loss": 1.0593150854110718, + "step": 9490 + }, + { + "ce_loss": 0.1475900262594223, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "distill_loss": 0.297240287065506, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "ref_ce_loss": 0.1502453237771988, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "loss": 0.5550252199172974, + "step": 9490 + }, + { + "ce_loss": 0.14874206483364105, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "distill_loss": 0.2950950860977173, + "epoch": 3.1654436290860573, + "step": 9490 + }, + { + "epoch": 3.1654436290860573, + "ref_ce_loss": 0.08617687225341797, + "step": 9490 + }, + { + "epoch": 3.1687791861240826, + "loss": 0.666, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "grad_norm": 6.840047836303711, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "learning_rate": 0.0002063528862923782, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "loss": 0.6700571775436401, + "step": 9500 + }, + { + "ce_loss": 0.2572788894176483, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "distill_loss": 0.2567271888256073, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "ref_ce_loss": 0.15579195320606232, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "loss": 0.9040383100509644, + "step": 9500 + }, + { + "ce_loss": 0.20419129729270935, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "distill_loss": 0.21273010969161987, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "ref_ce_loss": 0.11534770578145981, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "loss": 0.5888009667396545, + "step": 9500 + }, + { + "ce_loss": 0.10383474081754684, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "distill_loss": 0.2927468419075012, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "ref_ce_loss": 0.10847216844558716, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "loss": 0.44755592942237854, + "step": 9500 + }, + { + "ce_loss": 0.14601008594036102, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "distill_loss": 0.15512534976005554, + "epoch": 3.1687791861240826, + "step": 9500 + }, + { + "epoch": 3.1687791861240826, + "ref_ce_loss": 0.07985249161720276, + "step": 9500 + }, + { + "epoch": 3.172114743162108, + "loss": 0.6205, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "grad_norm": 2.793665885925293, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "learning_rate": 0.00020616511176371465, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "loss": 0.45676806569099426, + "step": 9510 + }, + { + "ce_loss": 0.14598795771598816, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "distill_loss": 0.16061095893383026, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "ref_ce_loss": 0.09271222352981567, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "loss": 0.5553157925605774, + "step": 9510 + }, + { + "ce_loss": 0.24441476166248322, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "distill_loss": 0.1846240609884262, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "ref_ce_loss": 0.12624230980873108, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "loss": 0.4163009524345398, + "step": 9510 + }, + { + "ce_loss": 0.11610870063304901, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "distill_loss": 0.16040027141571045, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "ref_ce_loss": 0.10182558000087738, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "loss": 0.3117366135120392, + "step": 9510 + }, + { + "ce_loss": 0.07078323513269424, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "distill_loss": 0.13040763139724731, + "epoch": 3.172114743162108, + "step": 9510 + }, + { + "epoch": 3.172114743162108, + "ref_ce_loss": 0.11036781966686249, + "step": 9510 + }, + { + "epoch": 3.1754503002001333, + "loss": 0.5296, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "grad_norm": 4.296565532684326, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "learning_rate": 0.00020597723481204251, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "loss": 0.3121442496776581, + "step": 9520 + }, + { + "ce_loss": 0.07688009738922119, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "distill_loss": 0.12842987477779388, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "ref_ce_loss": 0.10675547271966934, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "loss": 0.4808747172355652, + "step": 9520 + }, + { + "ce_loss": 0.07221264392137527, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "distill_loss": 0.15099433064460754, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "ref_ce_loss": 0.08741383999586105, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "loss": 0.28258243203163147, + "step": 9520 + }, + { + "ce_loss": 0.07468129694461823, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "distill_loss": 0.1378626972436905, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "ref_ce_loss": 0.06976884603500366, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "loss": 0.469711035490036, + "step": 9520 + }, + { + "ce_loss": 0.11323860287666321, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "distill_loss": 0.20424960553646088, + "epoch": 3.1754503002001333, + "step": 9520 + }, + { + "epoch": 3.1754503002001333, + "ref_ce_loss": 0.06783854216337204, + "step": 9520 + }, + { + "epoch": 3.1787858572381587, + "loss": 0.5476, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "grad_norm": 1.974441647529602, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "learning_rate": 0.0002057892557799753, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "loss": 0.41514265537261963, + "step": 9530 + }, + { + "ce_loss": 0.12711583077907562, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "distill_loss": 0.17615363001823425, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "ref_ce_loss": 0.07456578314304352, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "loss": 0.6223483085632324, + "step": 9530 + }, + { + "ce_loss": 0.16231295466423035, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "distill_loss": 0.1323533058166504, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "ref_ce_loss": 0.11479992419481277, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "loss": 0.4665870666503906, + "step": 9530 + }, + { + "ce_loss": 0.12044032663106918, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "distill_loss": 0.20676037669181824, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "ref_ce_loss": 0.09432920068502426, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "loss": 0.41781315207481384, + "step": 9530 + }, + { + "ce_loss": 0.15532222390174866, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "distill_loss": 0.1355230212211609, + "epoch": 3.1787858572381587, + "step": 9530 + }, + { + "epoch": 3.1787858572381587, + "ref_ce_loss": 0.1266229748725891, + "step": 9530 + }, + { + "epoch": 3.182121414276184, + "loss": 0.5524, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "grad_norm": 2.466517448425293, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "learning_rate": 0.00020560117501031264, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "loss": 0.3828505873680115, + "step": 9540 + }, + { + "ce_loss": 0.10816628485918045, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "distill_loss": 0.09501266479492188, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "ref_ce_loss": 0.14963635802268982, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "loss": 0.6822697520256042, + "step": 9540 + }, + { + "ce_loss": 0.0815955400466919, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "distill_loss": 0.17200466990470886, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "ref_ce_loss": 0.12888874113559723, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "loss": 0.428188681602478, + "step": 9540 + }, + { + "ce_loss": 0.12251435965299606, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "distill_loss": 0.1265813708305359, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "ref_ce_loss": 0.08265712857246399, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "loss": 0.4096594452857971, + "step": 9540 + }, + { + "ce_loss": 0.09690508246421814, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "distill_loss": 0.10855381190776825, + "epoch": 3.182121414276184, + "step": 9540 + }, + { + "epoch": 3.182121414276184, + "ref_ce_loss": 0.07601267099380493, + "step": 9540 + }, + { + "epoch": 3.1854569713142094, + "loss": 0.5364, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "grad_norm": 2.382157325744629, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "learning_rate": 0.0002054129928460396, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "loss": 0.38213956356048584, + "step": 9550 + }, + { + "ce_loss": 0.09391985088586807, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "distill_loss": 0.12190929055213928, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "ref_ce_loss": 0.09163673967123032, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "loss": 0.4600287675857544, + "step": 9550 + }, + { + "ce_loss": 0.12160974740982056, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "distill_loss": 0.13835075497627258, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "ref_ce_loss": 0.09320228546857834, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "loss": 0.5476783514022827, + "step": 9550 + }, + { + "ce_loss": 0.24290725588798523, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "distill_loss": 0.15814319252967834, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "ref_ce_loss": 0.12850640714168549, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "loss": 0.47217875719070435, + "step": 9550 + }, + { + "ce_loss": 0.09855984896421432, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "distill_loss": 0.13362620770931244, + "epoch": 3.1854569713142094, + "step": 9550 + }, + { + "epoch": 3.1854569713142094, + "ref_ce_loss": 0.07858073711395264, + "step": 9550 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.5146, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "grad_norm": 3.2701833248138428, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "learning_rate": 0.0002052247096303263, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.2855941355228424, + "step": 9560 + }, + { + "ce_loss": 0.06373196840286255, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "distill_loss": 0.13350118696689606, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "ref_ce_loss": 0.06919163465499878, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.48115861415863037, + "step": 9560 + }, + { + "ce_loss": 0.11704720556735992, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "distill_loss": 0.1717700958251953, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "ref_ce_loss": 0.11801206320524216, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.4057348072528839, + "step": 9560 + }, + { + "ce_loss": 0.10528801381587982, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "distill_loss": 0.18464790284633636, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "ref_ce_loss": 0.11558081954717636, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "loss": 0.5976885557174683, + "step": 9560 + }, + { + "ce_loss": 0.1965349018573761, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "distill_loss": 0.11449624598026276, + "epoch": 3.1887925283522347, + "step": 9560 + }, + { + "epoch": 3.1887925283522347, + "ref_ce_loss": 0.18658675253391266, + "step": 9560 + }, + { + "epoch": 3.19212808539026, + "loss": 0.5078, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "grad_norm": 2.857909679412842, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "learning_rate": 0.00020503632570652693, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "loss": 0.319354385137558, + "step": 9570 + }, + { + "ce_loss": 0.09820310026407242, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "distill_loss": 0.13627631962299347, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "ref_ce_loss": 0.08482971042394638, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "loss": 0.5038487315177917, + "step": 9570 + }, + { + "ce_loss": 0.13817772269248962, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "distill_loss": 0.1949910670518875, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "ref_ce_loss": 0.11488980799913406, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "loss": 0.6758288145065308, + "step": 9570 + }, + { + "ce_loss": 0.1828175038099289, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "distill_loss": 0.19590409100055695, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "ref_ce_loss": 0.11350443959236145, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "loss": 0.5674213171005249, + "step": 9570 + }, + { + "ce_loss": 0.08833687007427216, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "distill_loss": 0.13266253471374512, + "epoch": 3.19212808539026, + "step": 9570 + }, + { + "epoch": 3.19212808539026, + "ref_ce_loss": 0.07736072689294815, + "step": 9570 + }, + { + "epoch": 3.1954636424282854, + "loss": 0.516, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "grad_norm": 5.583287239074707, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "learning_rate": 0.00020484784141817957, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "loss": 0.8101859092712402, + "step": 9580 + }, + { + "ce_loss": 0.18054203689098358, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "distill_loss": 0.11705302447080612, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "ref_ce_loss": 0.10733237117528915, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "loss": 0.3281974792480469, + "step": 9580 + }, + { + "ce_loss": 0.13033422827720642, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "distill_loss": 0.1337510496377945, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "ref_ce_loss": 0.06400664150714874, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "loss": 0.5542563199996948, + "step": 9580 + }, + { + "ce_loss": 0.1291143149137497, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "distill_loss": 0.18237747251987457, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "ref_ce_loss": 0.09707945585250854, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "loss": 0.46752509474754333, + "step": 9580 + }, + { + "ce_loss": 0.16086234152317047, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "distill_loss": 0.12830647826194763, + "epoch": 3.1954636424282854, + "step": 9580 + }, + { + "epoch": 3.1954636424282854, + "ref_ce_loss": 0.11367620527744293, + "step": 9580 + }, + { + "epoch": 3.1987991994663107, + "loss": 0.4776, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "grad_norm": 2.494795322418213, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "learning_rate": 0.00020465925710900517, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "loss": 0.4189956784248352, + "step": 9590 + }, + { + "ce_loss": 0.1456405073404312, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "distill_loss": 0.129704087972641, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "ref_ce_loss": 0.09875006228685379, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "loss": 0.6570210456848145, + "step": 9590 + }, + { + "ce_loss": 0.2106141895055771, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "distill_loss": 0.13541418313980103, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "ref_ce_loss": 0.09310900419950485, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "loss": 0.5128756761550903, + "step": 9590 + }, + { + "ce_loss": 0.05535023659467697, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "distill_loss": 0.10842591524124146, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "ref_ce_loss": 0.07918021082878113, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "loss": 0.435626745223999, + "step": 9590 + }, + { + "ce_loss": 0.13782809674739838, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "distill_loss": 0.15453578531742096, + "epoch": 3.1987991994663107, + "step": 9590 + }, + { + "epoch": 3.1987991994663107, + "ref_ce_loss": 0.10330259054899216, + "step": 9590 + }, + { + "epoch": 3.202134756504336, + "loss": 0.5214, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "grad_norm": 3.278040647506714, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "learning_rate": 0.00020447057312290715, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "loss": 0.417108952999115, + "step": 9600 + }, + { + "ce_loss": 0.11677819490432739, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "distill_loss": 0.12888827919960022, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "ref_ce_loss": 0.12421637773513794, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "loss": 0.7868222594261169, + "step": 9600 + }, + { + "ce_loss": 0.1702321469783783, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "distill_loss": 0.15147097408771515, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "ref_ce_loss": 0.1157556101679802, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "loss": 0.3623127043247223, + "step": 9600 + }, + { + "ce_loss": 0.06962898373603821, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "distill_loss": 0.13172480463981628, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "ref_ce_loss": 0.11244726926088333, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "loss": 0.579292893409729, + "step": 9600 + }, + { + "ce_loss": 0.2325315922498703, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "distill_loss": 0.21294154226779938, + "epoch": 3.202134756504336, + "step": 9600 + }, + { + "epoch": 3.202134756504336, + "ref_ce_loss": 0.08337957412004471, + "step": 9600 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.4992, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "grad_norm": 3.2729134559631348, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "learning_rate": 0.00020428178980397063, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.5233567357063293, + "step": 9610 + }, + { + "ce_loss": 0.14610396325588226, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "distill_loss": 0.20599067211151123, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "ref_ce_loss": 0.07143604010343552, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.3326717019081116, + "step": 9610 + }, + { + "ce_loss": 0.09717848896980286, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "distill_loss": 0.12722012400627136, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "ref_ce_loss": 0.10740362107753754, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.5446369051933289, + "step": 9610 + }, + { + "ce_loss": 0.21039313077926636, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "distill_loss": 0.13953128457069397, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "ref_ce_loss": 0.10978644341230392, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "loss": 0.48363035917282104, + "step": 9610 + }, + { + "ce_loss": 0.13355810940265656, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "distill_loss": 0.1513618379831314, + "epoch": 3.2054703135423614, + "step": 9610 + }, + { + "epoch": 3.2054703135423614, + "ref_ce_loss": 0.1118159294128418, + "step": 9610 + }, + { + "epoch": 3.208805870580387, + "loss": 0.5102, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "grad_norm": 2.630423069000244, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "learning_rate": 0.00020409290749646189, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "loss": 0.4511738717556, + "step": 9620 + }, + { + "ce_loss": 0.12089888751506805, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "distill_loss": 0.12530340254306793, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "ref_ce_loss": 0.176653191447258, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "loss": 0.45738130807876587, + "step": 9620 + }, + { + "ce_loss": 0.14809496700763702, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "distill_loss": 0.10620304197072983, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "ref_ce_loss": 0.06848637014627457, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "loss": 0.5911251306533813, + "step": 9620 + }, + { + "ce_loss": 0.07842303812503815, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "distill_loss": 0.13734924793243408, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "ref_ce_loss": 0.14106030762195587, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "loss": 0.44962990283966064, + "step": 9620 + }, + { + "ce_loss": 0.06224361062049866, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "distill_loss": 0.12899431586265564, + "epoch": 3.208805870580387, + "step": 9620 + }, + { + "epoch": 3.208805870580387, + "ref_ce_loss": 0.0823230966925621, + "step": 9620 + }, + { + "epoch": 3.212141427618412, + "loss": 0.4656, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "grad_norm": 2.494377374649048, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "learning_rate": 0.00020390392654482783, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "loss": 0.33894914388656616, + "step": 9630 + }, + { + "ce_loss": 0.11409568786621094, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "distill_loss": 0.11827443540096283, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "ref_ce_loss": 0.058073848485946655, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "loss": 0.3645583987236023, + "step": 9630 + }, + { + "ce_loss": 0.09342974424362183, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "distill_loss": 0.13384076952934265, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "ref_ce_loss": 0.10339362919330597, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "loss": 0.5319348573684692, + "step": 9630 + }, + { + "ce_loss": 0.21150082349777222, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "distill_loss": 0.14189140498638153, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "ref_ce_loss": 0.13533258438110352, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "loss": 0.48543843626976013, + "step": 9630 + }, + { + "ce_loss": 0.19601376354694366, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "distill_loss": 0.13757900893688202, + "epoch": 3.212141427618412, + "step": 9630 + }, + { + "epoch": 3.212141427618412, + "ref_ce_loss": 0.12655571103096008, + "step": 9630 + }, + { + "epoch": 3.2154769846564375, + "loss": 0.5622, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "grad_norm": 4.793918609619141, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "learning_rate": 0.0002037148472936951, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "loss": 0.3617390990257263, + "step": 9640 + }, + { + "ce_loss": 0.1141476035118103, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "distill_loss": 0.13533206284046173, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "ref_ce_loss": 0.08762861788272858, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "loss": 0.423265278339386, + "step": 9640 + }, + { + "ce_loss": 0.1403900384902954, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "distill_loss": 0.15556564927101135, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "ref_ce_loss": 0.10428271442651749, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "loss": 0.5071457624435425, + "step": 9640 + }, + { + "ce_loss": 0.1454550176858902, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "distill_loss": 0.10869236290454865, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "ref_ce_loss": 0.11991751939058304, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "loss": 0.746184229850769, + "step": 9640 + }, + { + "ce_loss": 0.1692987084388733, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "distill_loss": 0.20635975897312164, + "epoch": 3.2154769846564375, + "step": 9640 + }, + { + "epoch": 3.2154769846564375, + "ref_ce_loss": 0.09694461524486542, + "step": 9640 + }, + { + "epoch": 3.218812541694463, + "loss": 1.5343, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "grad_norm": 8.480672836303711, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "learning_rate": 0.00020352567008786963, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "loss": 1.3023204803466797, + "step": 9650 + }, + { + "ce_loss": 0.7586283683776855, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "distill_loss": 0.1492748260498047, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "ref_ce_loss": 0.3516393005847931, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "loss": 1.6434491872787476, + "step": 9650 + }, + { + "ce_loss": 0.7541458606719971, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "distill_loss": 0.15649262070655823, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "ref_ce_loss": 0.3518027663230896, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "loss": 1.1772658824920654, + "step": 9650 + }, + { + "ce_loss": 0.6369430422782898, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "distill_loss": 0.12943366169929504, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "ref_ce_loss": 0.3679516613483429, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "loss": 1.3905925750732422, + "step": 9650 + }, + { + "ce_loss": 0.726006805896759, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "distill_loss": 0.11813858151435852, + "epoch": 3.218812541694463, + "step": 9650 + }, + { + "epoch": 3.218812541694463, + "ref_ce_loss": 0.3810167610645294, + "step": 9650 + }, + { + "epoch": 3.222148098732488, + "loss": 1.0659, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "grad_norm": 3.7545816898345947, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "learning_rate": 0.00020333639527233616, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "loss": 0.5680769681930542, + "step": 9660 + }, + { + "ce_loss": 0.20624542236328125, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "distill_loss": 0.14041753113269806, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "ref_ce_loss": 0.18326659500598907, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "loss": 1.0041627883911133, + "step": 9660 + }, + { + "ce_loss": 0.4418887794017792, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "distill_loss": 0.13282467424869537, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "ref_ce_loss": 0.37674033641815186, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "loss": 0.7607677578926086, + "step": 9660 + }, + { + "ce_loss": 0.35817694664001465, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "distill_loss": 0.1649259328842163, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "ref_ce_loss": 0.1993914395570755, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "loss": 0.6379758715629578, + "step": 9660 + }, + { + "ce_loss": 0.29930704832077026, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "distill_loss": 0.1762618124485016, + "epoch": 3.222148098732488, + "step": 9660 + }, + { + "epoch": 3.222148098732488, + "ref_ce_loss": 0.1622111052274704, + "step": 9660 + }, + { + "epoch": 3.2254836557705135, + "loss": 0.659, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "grad_norm": 27.17189598083496, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "learning_rate": 0.00020314702319225718, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "loss": 1.4548907279968262, + "step": 9670 + }, + { + "ce_loss": 0.35011687874794006, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "distill_loss": 0.9352494478225708, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "ref_ce_loss": 0.11367769539356232, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "loss": 1.437563419342041, + "step": 9670 + }, + { + "ce_loss": 0.15209540724754333, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "distill_loss": 0.9998146295547485, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "ref_ce_loss": 0.14491644501686096, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "loss": 2.1415417194366455, + "step": 9670 + }, + { + "ce_loss": 0.24623355269432068, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "distill_loss": 1.5178892612457275, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "ref_ce_loss": 0.202957421541214, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "loss": 0.9889248013496399, + "step": 9670 + }, + { + "ce_loss": 0.09803508222103119, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "distill_loss": 0.8088805675506592, + "epoch": 3.2254836557705135, + "step": 9670 + }, + { + "epoch": 3.2254836557705135, + "ref_ce_loss": 0.05574851855635643, + "step": 9670 + }, + { + "epoch": 3.228819212808539, + "loss": 1.0588, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "grad_norm": 3.260193109512329, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "learning_rate": 0.00020295755419297268, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "loss": 0.6862177848815918, + "step": 9680 + }, + { + "ce_loss": 0.17582151293754578, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "distill_loss": 0.3368160128593445, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "ref_ce_loss": 0.08550892025232315, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "loss": 0.3851708173751831, + "step": 9680 + }, + { + "ce_loss": 0.05822121724486351, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "distill_loss": 0.23647257685661316, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "ref_ce_loss": 0.0903349369764328, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "loss": 0.6628296971321106, + "step": 9680 + }, + { + "ce_loss": 0.0833716094493866, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "distill_loss": 0.3014061152935028, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "ref_ce_loss": 0.1067894697189331, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "loss": 0.5852959156036377, + "step": 9680 + }, + { + "ce_loss": 0.07522204518318176, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "distill_loss": 0.3575586974620819, + "epoch": 3.228819212808539, + "step": 9680 + }, + { + "epoch": 3.228819212808539, + "ref_ce_loss": 0.09939462691545486, + "step": 9680 + }, + { + "epoch": 3.2321547698465642, + "loss": 0.6606, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "grad_norm": 2.6169354915618896, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "learning_rate": 0.00020276798861999933, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "loss": 0.5074576139450073, + "step": 9690 + }, + { + "ce_loss": 0.15143181383609772, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "distill_loss": 0.2413361668586731, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "ref_ce_loss": 0.07629888504743576, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "loss": 0.5918530225753784, + "step": 9690 + }, + { + "ce_loss": 0.2147100418806076, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "distill_loss": 0.17524051666259766, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "ref_ce_loss": 0.1480754017829895, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "loss": 0.37897729873657227, + "step": 9690 + }, + { + "ce_loss": 0.09335962682962418, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "distill_loss": 0.19734136760234833, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "ref_ce_loss": 0.08819133788347244, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "loss": 0.685653805732727, + "step": 9690 + }, + { + "ce_loss": 0.13283292949199677, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "distill_loss": 0.16413123905658722, + "epoch": 3.2321547698465642, + "step": 9690 + }, + { + "epoch": 3.2321547698465642, + "ref_ce_loss": 0.10335833579301834, + "step": 9690 + }, + { + "epoch": 3.2354903268845896, + "loss": 0.4984, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "grad_norm": 3.0020861625671387, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "learning_rate": 0.00020257832681903012, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "loss": 0.25732314586639404, + "step": 9700 + }, + { + "ce_loss": 0.032223112881183624, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "distill_loss": 0.14216585457324982, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "ref_ce_loss": 0.054947543889284134, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "loss": 0.4035998582839966, + "step": 9700 + }, + { + "ce_loss": 0.09584304690361023, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "distill_loss": 0.16314128041267395, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "ref_ce_loss": 0.08428196609020233, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "loss": 0.7436294555664062, + "step": 9700 + }, + { + "ce_loss": 0.18889939785003662, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "distill_loss": 0.21565823256969452, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "ref_ce_loss": 0.13887543976306915, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "loss": 0.4003385007381439, + "step": 9700 + }, + { + "ce_loss": 0.06409730017185211, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "distill_loss": 0.189022034406662, + "epoch": 3.2354903268845896, + "step": 9700 + }, + { + "epoch": 3.2354903268845896, + "ref_ce_loss": 0.08388952910900116, + "step": 9700 + }, + { + "epoch": 3.238825883922615, + "loss": 0.581, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "grad_norm": 2.807310104370117, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "learning_rate": 0.00020238856913593317, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "loss": 0.45934754610061646, + "step": 9710 + }, + { + "ce_loss": 0.1265133172273636, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "distill_loss": 0.12370441854000092, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "ref_ce_loss": 0.09366489946842194, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "loss": 0.5969701409339905, + "step": 9710 + }, + { + "ce_loss": 0.1662624329328537, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "distill_loss": 0.16972975432872772, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "ref_ce_loss": 0.11409129947423935, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "loss": 0.5954180955886841, + "step": 9710 + }, + { + "ce_loss": 0.18566739559173584, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "distill_loss": 0.16116562485694885, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "ref_ce_loss": 0.12154942750930786, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "loss": 0.4547787308692932, + "step": 9710 + }, + { + "ce_loss": 0.08679163455963135, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "distill_loss": 0.1623559296131134, + "epoch": 3.238825883922615, + "step": 9710 + }, + { + "epoch": 3.238825883922615, + "ref_ce_loss": 0.1384986937046051, + "step": 9710 + }, + { + "epoch": 3.2421614409606403, + "loss": 0.5901, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "grad_norm": 8.585079193115234, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "learning_rate": 0.00020219871591675172, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "loss": 0.48637694120407104, + "step": 9720 + }, + { + "ce_loss": 0.15971849858760834, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "distill_loss": 0.14762723445892334, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "ref_ce_loss": 0.12237177044153214, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "loss": 0.5249758362770081, + "step": 9720 + }, + { + "ce_loss": 0.21139027178287506, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "distill_loss": 0.17246796190738678, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "ref_ce_loss": 0.14065127074718475, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "loss": 0.4310125410556793, + "step": 9720 + }, + { + "ce_loss": 0.13576455414295197, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "distill_loss": 0.12842974066734314, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "ref_ce_loss": 0.10069247335195541, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "loss": 0.45626258850097656, + "step": 9720 + }, + { + "ce_loss": 0.16060946881771088, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "distill_loss": 0.13746333122253418, + "epoch": 3.2421614409606403, + "step": 9720 + }, + { + "epoch": 3.2421614409606403, + "ref_ce_loss": 0.13184267282485962, + "step": 9720 + }, + { + "epoch": 3.2454969979986656, + "loss": 0.5579, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "grad_norm": 4.468499660491943, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "learning_rate": 0.00020200876750770317, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "loss": 0.5182029008865356, + "step": 9730 + }, + { + "ce_loss": 0.15194390714168549, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "distill_loss": 0.2397410273551941, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "ref_ce_loss": 0.1261671483516693, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "loss": 0.40942132472991943, + "step": 9730 + }, + { + "ce_loss": 0.11343543231487274, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "distill_loss": 0.17565378546714783, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "ref_ce_loss": 0.12022323161363602, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "loss": 0.4131905436515808, + "step": 9730 + }, + { + "ce_loss": 0.09293036162853241, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "distill_loss": 0.16029858589172363, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "ref_ce_loss": 0.10926628857851028, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "loss": 0.543439507484436, + "step": 9730 + }, + { + "ce_loss": 0.1307881474494934, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "distill_loss": 0.15304943919181824, + "epoch": 3.2454969979986656, + "step": 9730 + }, + { + "epoch": 3.2454969979986656, + "ref_ce_loss": 0.13484518229961395, + "step": 9730 + }, + { + "epoch": 3.248832555036691, + "loss": 0.5438, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "grad_norm": 2.117421865463257, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "learning_rate": 0.00020181872425517847, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "loss": 0.4847257435321808, + "step": 9740 + }, + { + "ce_loss": 0.11899697780609131, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "distill_loss": 0.14587488770484924, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "ref_ce_loss": 0.14244896173477173, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "loss": 0.46945518255233765, + "step": 9740 + }, + { + "ce_loss": 0.15519380569458008, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "distill_loss": 0.19748863577842712, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "ref_ce_loss": 0.11662587523460388, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "loss": 0.7112184166908264, + "step": 9740 + }, + { + "ce_loss": 0.09962423145771027, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "distill_loss": 0.18776388466358185, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "ref_ce_loss": 0.11562529951334, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "loss": 0.5923649668693542, + "step": 9740 + }, + { + "ce_loss": 0.189992755651474, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "distill_loss": 0.1979469507932663, + "epoch": 3.248832555036691, + "step": 9740 + }, + { + "epoch": 3.248832555036691, + "ref_ce_loss": 0.11803510040044785, + "step": 9740 + }, + { + "epoch": 3.2521681120747163, + "loss": 0.5334, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "grad_norm": 5.560446262359619, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "learning_rate": 0.00020162858650574154, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "loss": 0.5909914970397949, + "step": 9750 + }, + { + "ce_loss": 0.23804140090942383, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "distill_loss": 0.16925199329853058, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "ref_ce_loss": 0.14933234453201294, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "loss": 0.5917761325836182, + "step": 9750 + }, + { + "ce_loss": 0.09722870588302612, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "distill_loss": 0.13730478286743164, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "ref_ce_loss": 0.05942942947149277, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "loss": 0.6330657005310059, + "step": 9750 + }, + { + "ce_loss": 0.1761714220046997, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "distill_loss": 0.1895550787448883, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "ref_ce_loss": 0.1777772605419159, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "loss": 0.5256844758987427, + "step": 9750 + }, + { + "ce_loss": 0.13771918416023254, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "distill_loss": 0.14787374436855316, + "epoch": 3.2521681120747163, + "step": 9750 + }, + { + "epoch": 3.2521681120747163, + "ref_ce_loss": 0.12790906429290771, + "step": 9750 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.521, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "grad_norm": 2.0890445709228516, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "learning_rate": 0.00020143835460612866, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.42880985140800476, + "step": 9760 + }, + { + "ce_loss": 0.16630008816719055, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "distill_loss": 0.1700364351272583, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "ref_ce_loss": 0.07479525357484818, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.5276511907577515, + "step": 9760 + }, + { + "ce_loss": 0.19932356476783752, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "distill_loss": 0.1725931465625763, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "ref_ce_loss": 0.1009768545627594, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.4974745213985443, + "step": 9760 + }, + { + "ce_loss": 0.18790408968925476, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "distill_loss": 0.1412593126296997, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "ref_ce_loss": 0.0973147377371788, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "loss": 0.48675134778022766, + "step": 9760 + }, + { + "ce_loss": 0.0493842251598835, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "distill_loss": 0.11071799695491791, + "epoch": 3.2555036691127417, + "step": 9760 + }, + { + "epoch": 3.2555036691127417, + "ref_ce_loss": 0.09657184034585953, + "step": 9760 + }, + { + "epoch": 3.258839226150767, + "loss": 0.5075, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "grad_norm": 1.9114854335784912, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "learning_rate": 0.00020124802890324775, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "loss": 0.4194925129413605, + "step": 9770 + }, + { + "ce_loss": 0.14611773192882538, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "distill_loss": 0.1504100263118744, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "ref_ce_loss": 0.09022071212530136, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "loss": 0.5268236398696899, + "step": 9770 + }, + { + "ce_loss": 0.12277179956436157, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "distill_loss": 0.14569061994552612, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "ref_ce_loss": 0.10679484158754349, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "loss": 0.4402240514755249, + "step": 9770 + }, + { + "ce_loss": 0.12379911541938782, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "distill_loss": 0.11681478470563889, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "ref_ce_loss": 0.14763450622558594, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "loss": 0.278000146150589, + "step": 9770 + }, + { + "ce_loss": 0.08359132707118988, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "distill_loss": 0.0681973546743393, + "epoch": 3.258839226150767, + "step": 9770 + }, + { + "epoch": 3.258839226150767, + "ref_ce_loss": 0.08435453474521637, + "step": 9770 + }, + { + "epoch": 3.2621747831887924, + "loss": 0.4993, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "grad_norm": 2.544344186782837, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "learning_rate": 0.0002010576097441778, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "loss": 0.8057675957679749, + "step": 9780 + }, + { + "ce_loss": 0.13802434504032135, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "distill_loss": 0.16143648326396942, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "ref_ce_loss": 0.0895475447177887, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "loss": 0.5339944958686829, + "step": 9780 + }, + { + "ce_loss": 0.1788073182106018, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "distill_loss": 0.16007177531719208, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "ref_ce_loss": 0.09300737082958221, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "loss": 0.6089990735054016, + "step": 9780 + }, + { + "ce_loss": 0.2176850438117981, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "distill_loss": 0.22681301832199097, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "ref_ce_loss": 0.1334032416343689, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "loss": 0.2944650948047638, + "step": 9780 + }, + { + "ce_loss": 0.08381827920675278, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "distill_loss": 0.12243427336215973, + "epoch": 3.2621747831887924, + "step": 9780 + }, + { + "epoch": 3.2621747831887924, + "ref_ce_loss": 0.08806728571653366, + "step": 9780 + }, + { + "epoch": 3.2655103402268177, + "loss": 0.5072, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "grad_norm": 2.3298983573913574, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "learning_rate": 0.00020086709747616822, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "loss": 0.381562739610672, + "step": 9790 + }, + { + "ce_loss": 0.10704471915960312, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "distill_loss": 0.08754559606313705, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "ref_ce_loss": 0.10674937814474106, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "loss": 0.4779171943664551, + "step": 9790 + }, + { + "ce_loss": 0.1513449251651764, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "distill_loss": 0.12535059452056885, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "ref_ce_loss": 0.1194036528468132, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "loss": 0.6823694705963135, + "step": 9790 + }, + { + "ce_loss": 0.20080308616161346, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "distill_loss": 0.18392544984817505, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "ref_ce_loss": 0.09276615083217621, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "loss": 0.4755535125732422, + "step": 9790 + }, + { + "ce_loss": 0.14225377142429352, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "distill_loss": 0.13601207733154297, + "epoch": 3.2655103402268177, + "step": 9790 + }, + { + "epoch": 3.2655103402268177, + "ref_ce_loss": 0.14101681113243103, + "step": 9790 + }, + { + "epoch": 3.268845897264843, + "loss": 0.4772, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "grad_norm": 2.873114824295044, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "learning_rate": 0.00020067649244663837, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "loss": 0.47671616077423096, + "step": 9800 + }, + { + "ce_loss": 0.18779507279396057, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "distill_loss": 0.15565767884254456, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "ref_ce_loss": 0.13307105004787445, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "loss": 0.6724848747253418, + "step": 9800 + }, + { + "ce_loss": 0.12847858667373657, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "distill_loss": 0.11508485674858093, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "ref_ce_loss": 0.1192900612950325, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "loss": 0.5285953283309937, + "step": 9800 + }, + { + "ce_loss": 0.14411479234695435, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "distill_loss": 0.19417299330234528, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "ref_ce_loss": 0.1415955275297165, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "loss": 0.7303289175033569, + "step": 9800 + }, + { + "ce_loss": 0.11429604887962341, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "distill_loss": 0.12689504027366638, + "epoch": 3.268845897264843, + "step": 9800 + }, + { + "epoch": 3.268845897264843, + "ref_ce_loss": 0.07350821793079376, + "step": 9800 + }, + { + "epoch": 3.2721814543028684, + "loss": 0.4966, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "grad_norm": 3.44111967086792, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "learning_rate": 0.00020048579500317652, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "loss": 0.6512364149093628, + "step": 9810 + }, + { + "ce_loss": 0.24342629313468933, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "distill_loss": 0.1871444135904312, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "ref_ce_loss": 0.16401995718479156, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "loss": 0.3500097393989563, + "step": 9810 + }, + { + "ce_loss": 0.032036811113357544, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "distill_loss": 0.08574137091636658, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "ref_ce_loss": 0.06518110632896423, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "loss": 1.2171872854232788, + "step": 9810 + }, + { + "ce_loss": 0.1737753450870514, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "distill_loss": 0.184116393327713, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "ref_ce_loss": 0.1184069886803627, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "loss": 0.5101798176765442, + "step": 9810 + }, + { + "ce_loss": 0.14508378505706787, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "distill_loss": 0.16374604403972626, + "epoch": 3.2721814543028684, + "step": 9810 + }, + { + "epoch": 3.2721814543028684, + "ref_ce_loss": 0.13258105516433716, + "step": 9810 + }, + { + "epoch": 3.275517011340894, + "loss": 0.5242, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "grad_norm": 3.690037965774536, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "learning_rate": 0.00020029500549353953, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "loss": 0.3060483932495117, + "step": 9820 + }, + { + "ce_loss": 0.09589601308107376, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "distill_loss": 0.12110260128974915, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "ref_ce_loss": 0.06255815178155899, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "loss": 0.421102911233902, + "step": 9820 + }, + { + "ce_loss": 0.13635626435279846, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "distill_loss": 0.1552823930978775, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "ref_ce_loss": 0.08633653819561005, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "loss": 0.7562495470046997, + "step": 9820 + }, + { + "ce_loss": 0.1636921614408493, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "distill_loss": 0.15046286582946777, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "ref_ce_loss": 0.13611376285552979, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "loss": 0.46156540513038635, + "step": 9820 + }, + { + "ce_loss": 0.1379977911710739, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "distill_loss": 0.13064102828502655, + "epoch": 3.275517011340894, + "step": 9820 + }, + { + "epoch": 3.275517011340894, + "ref_ce_loss": 0.12863753736019135, + "step": 9820 + }, + { + "epoch": 3.278852568378919, + "loss": 0.4644, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "grad_norm": 2.5951879024505615, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "learning_rate": 0.00020010412426565231, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "loss": 0.9946610927581787, + "step": 9830 + }, + { + "ce_loss": 0.21470636129379272, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "distill_loss": 0.21564292907714844, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "ref_ce_loss": 0.12243160605430603, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "loss": 0.48204684257507324, + "step": 9830 + }, + { + "ce_loss": 0.12645453214645386, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "distill_loss": 0.1417766958475113, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "ref_ce_loss": 0.05860704556107521, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "loss": 0.2505921423435211, + "step": 9830 + }, + { + "ce_loss": 0.050950322300195694, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "distill_loss": 0.10858387500047684, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "ref_ce_loss": 0.09072738885879517, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "loss": 0.520016610622406, + "step": 9830 + }, + { + "ce_loss": 0.2073366940021515, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "distill_loss": 0.19920435547828674, + "epoch": 3.278852568378919, + "step": 9830 + }, + { + "epoch": 3.278852568378919, + "ref_ce_loss": 0.11311342567205429, + "step": 9830 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.4845, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "grad_norm": 3.4132983684539795, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "learning_rate": 0.00019991315166760696, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.4670141935348511, + "step": 9840 + }, + { + "ce_loss": 0.1389661729335785, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "distill_loss": 0.11367867887020111, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "ref_ce_loss": 0.12627971172332764, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.5269565582275391, + "step": 9840 + }, + { + "ce_loss": 0.08351260423660278, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "distill_loss": 0.11990583688020706, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "ref_ce_loss": 0.13506436347961426, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.2908315360546112, + "step": 9840 + }, + { + "ce_loss": 0.10760679095983505, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "distill_loss": 0.08831951022148132, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "ref_ce_loss": 0.07872118055820465, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "loss": 0.4684480130672455, + "step": 9840 + }, + { + "ce_loss": 0.17640560865402222, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "distill_loss": 0.15796883404254913, + "epoch": 3.2821881254169445, + "step": 9840 + }, + { + "epoch": 3.2821881254169445, + "ref_ce_loss": 0.13395830988883972, + "step": 9840 + }, + { + "epoch": 3.28552368245497, + "loss": 0.4738, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "grad_norm": 3.823991060256958, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "learning_rate": 0.00019972208804766204, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "loss": 0.5192458629608154, + "step": 9850 + }, + { + "ce_loss": 0.13076889514923096, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "distill_loss": 0.12664109468460083, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "ref_ce_loss": 0.1299402117729187, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "loss": 0.5129473805427551, + "step": 9850 + }, + { + "ce_loss": 0.18821565806865692, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "distill_loss": 0.11755567044019699, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "ref_ce_loss": 0.11735755950212479, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "loss": 0.4829626679420471, + "step": 9850 + }, + { + "ce_loss": 0.20330511033535004, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "distill_loss": 0.11206808686256409, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "ref_ce_loss": 0.0962306559085846, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "loss": 0.23460951447486877, + "step": 9850 + }, + { + "ce_loss": 0.03618684411048889, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "distill_loss": 0.08662353456020355, + "epoch": 3.28552368245497, + "step": 9850 + }, + { + "epoch": 3.28552368245497, + "ref_ce_loss": 0.06193290650844574, + "step": 9850 + }, + { + "epoch": 3.288859239492995, + "loss": 0.5143, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "grad_norm": 2.249880790710449, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "learning_rate": 0.0001995309337542423, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "loss": 0.38961324095726013, + "step": 9860 + }, + { + "ce_loss": 0.1265280395746231, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "distill_loss": 0.11052341014146805, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "ref_ce_loss": 0.08913455903530121, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "loss": 0.40968209505081177, + "step": 9860 + }, + { + "ce_loss": 0.15417298674583435, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "distill_loss": 0.10991737991571426, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "ref_ce_loss": 0.145408034324646, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "loss": 0.34837135672569275, + "step": 9860 + }, + { + "ce_loss": 0.07209538668394089, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "distill_loss": 0.12966960668563843, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "ref_ce_loss": 0.14647279679775238, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "loss": 0.32055962085723877, + "step": 9860 + }, + { + "ce_loss": 0.1121489554643631, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "distill_loss": 0.1139790490269661, + "epoch": 3.288859239492995, + "step": 9860 + }, + { + "epoch": 3.288859239492995, + "ref_ce_loss": 0.09429316222667694, + "step": 9860 + }, + { + "epoch": 3.2921947965310205, + "loss": 0.462, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "grad_norm": 3.066953659057617, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "learning_rate": 0.00019933968913593775, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "loss": 0.4638405442237854, + "step": 9870 + }, + { + "ce_loss": 0.15076488256454468, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "distill_loss": 0.13203120231628418, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "ref_ce_loss": 0.08420390635728836, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "loss": 0.32437434792518616, + "step": 9870 + }, + { + "ce_loss": 0.10534469038248062, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "distill_loss": 0.10000304877758026, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "ref_ce_loss": 0.05723772197961807, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "loss": 0.6644257307052612, + "step": 9870 + }, + { + "ce_loss": 0.2317466139793396, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "distill_loss": 0.14958073198795319, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "ref_ce_loss": 0.12686073780059814, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "loss": 0.41358280181884766, + "step": 9870 + }, + { + "ce_loss": 0.1507405787706375, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "distill_loss": 0.10844182223081589, + "epoch": 3.2921947965310205, + "step": 9870 + }, + { + "epoch": 3.2921947965310205, + "ref_ce_loss": 0.0918106660246849, + "step": 9870 + }, + { + "epoch": 3.295530353569046, + "loss": 0.5121, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "grad_norm": 3.1873927116394043, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "learning_rate": 0.0001991483545415031, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "loss": 0.44472411274909973, + "step": 9880 + }, + { + "ce_loss": 0.1372465342283249, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "distill_loss": 0.17125433683395386, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "ref_ce_loss": 0.10206897556781769, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "loss": 0.3743453025817871, + "step": 9880 + }, + { + "ce_loss": 0.11211249977350235, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "distill_loss": 0.17139093577861786, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "ref_ce_loss": 0.09072692692279816, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "loss": 0.5743526220321655, + "step": 9880 + }, + { + "ce_loss": 0.1388760209083557, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "distill_loss": 0.1605054885149002, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "ref_ce_loss": 0.1317940354347229, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "loss": 0.25721773505210876, + "step": 9880 + }, + { + "ce_loss": 0.06072324141860008, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "distill_loss": 0.12532374262809753, + "epoch": 3.295530353569046, + "step": 9880 + }, + { + "epoch": 3.295530353569046, + "ref_ce_loss": 0.051129937171936035, + "step": 9880 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.4627, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "grad_norm": 3.4960994720458984, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "learning_rate": 0.0001989569303198573, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.47169768810272217, + "step": 9890 + }, + { + "ce_loss": 0.22074246406555176, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "distill_loss": 0.1348043829202652, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "ref_ce_loss": 0.09197013825178146, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.303199827671051, + "step": 9890 + }, + { + "ce_loss": 0.08557787537574768, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "distill_loss": 0.1385389268398285, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "ref_ce_loss": 0.07887089997529984, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.38248252868652344, + "step": 9890 + }, + { + "ce_loss": 0.1412307769060135, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "distill_loss": 0.14288009703159332, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "ref_ce_loss": 0.09742052853107452, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "loss": 0.5217670798301697, + "step": 9890 + }, + { + "ce_loss": 0.19139555096626282, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "distill_loss": 0.12419295310974121, + "epoch": 3.2988659106070712, + "step": 9890 + }, + { + "epoch": 3.2988659106070712, + "ref_ce_loss": 0.11238839477300644, + "step": 9890 + }, + { + "epoch": 3.3022014676450966, + "loss": 0.4834, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "grad_norm": 2.676337480545044, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "learning_rate": 0.00019876541682008246, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "loss": 1.0787899494171143, + "step": 9900 + }, + { + "ce_loss": 0.1795887053012848, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "distill_loss": 0.13583481311798096, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "ref_ce_loss": 0.12809765338897705, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "loss": 0.5525591969490051, + "step": 9900 + }, + { + "ce_loss": 0.13453686237335205, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "distill_loss": 0.11752891540527344, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "ref_ce_loss": 0.15759889781475067, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "loss": 0.3806399703025818, + "step": 9900 + }, + { + "ce_loss": 0.10052482038736343, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "distill_loss": 0.09455669671297073, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "ref_ce_loss": 0.134662926197052, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "loss": 0.30280801653862, + "step": 9900 + }, + { + "ce_loss": 0.09678518772125244, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "distill_loss": 0.10427294671535492, + "epoch": 3.3022014676450966, + "step": 9900 + }, + { + "epoch": 3.3022014676450966, + "ref_ce_loss": 0.10156451910734177, + "step": 9900 + }, + { + "epoch": 3.305537024683122, + "loss": 0.5029, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "grad_norm": 4.027454853057861, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "learning_rate": 0.00019857381439142372, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "loss": 0.25459831953048706, + "step": 9910 + }, + { + "ce_loss": 0.042915359139442444, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "distill_loss": 0.10543102025985718, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "ref_ce_loss": 0.10604436695575714, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "loss": 0.3459298014640808, + "step": 9910 + }, + { + "ce_loss": 0.09528271108865738, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "distill_loss": 0.11957503855228424, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "ref_ce_loss": 0.10496371984481812, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "loss": 0.3889404237270355, + "step": 9910 + }, + { + "ce_loss": 0.1402936428785324, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "distill_loss": 0.1201651319861412, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "ref_ce_loss": 0.12824715673923492, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "loss": 0.44783639907836914, + "step": 9910 + }, + { + "ce_loss": 0.1377970278263092, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "distill_loss": 0.11447380483150482, + "epoch": 3.305537024683122, + "step": 9910 + }, + { + "epoch": 3.305537024683122, + "ref_ce_loss": 0.12642312049865723, + "step": 9910 + }, + { + "epoch": 3.3088725817211473, + "loss": 0.4543, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "grad_norm": 2.3650686740875244, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "learning_rate": 0.00019838212338328838, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "loss": 0.9150041341781616, + "step": 9920 + }, + { + "ce_loss": 0.16532965004444122, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "distill_loss": 0.12562328577041626, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "ref_ce_loss": 0.12355418503284454, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "loss": 0.5890618562698364, + "step": 9920 + }, + { + "ce_loss": 0.22781842947006226, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "distill_loss": 0.12290843576192856, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "ref_ce_loss": 0.16453175246715546, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "loss": 0.5935525894165039, + "step": 9920 + }, + { + "ce_loss": 0.12414514273405075, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "distill_loss": 0.10505258291959763, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "ref_ce_loss": 0.12029238045215607, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "loss": 0.5247844457626343, + "step": 9920 + }, + { + "ce_loss": 0.11055473983287811, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "distill_loss": 0.1137736439704895, + "epoch": 3.3088725817211473, + "step": 9920 + }, + { + "epoch": 3.3088725817211473, + "ref_ce_loss": 0.11973520368337631, + "step": 9920 + }, + { + "epoch": 3.3122081387591726, + "loss": 0.4588, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "grad_norm": 2.262359857559204, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "learning_rate": 0.00019819034414524515, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "loss": 0.5808669328689575, + "step": 9930 + }, + { + "ce_loss": 0.19623792171478271, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "distill_loss": 0.1292586475610733, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "ref_ce_loss": 0.11712811887264252, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "loss": 0.3369356393814087, + "step": 9930 + }, + { + "ce_loss": 0.10894513875246048, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "distill_loss": 0.09358334541320801, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "ref_ce_loss": 0.1341428905725479, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "loss": 0.6350229382514954, + "step": 9930 + }, + { + "ce_loss": 0.27962931990623474, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "distill_loss": 0.15435360372066498, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "ref_ce_loss": 0.1478508561849594, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "loss": 0.34648460149765015, + "step": 9930 + }, + { + "ce_loss": 0.13289031386375427, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "distill_loss": 0.10292407125234604, + "epoch": 3.3122081387591726, + "step": 9930 + }, + { + "epoch": 3.3122081387591726, + "ref_ce_loss": 0.1104697436094284, + "step": 9930 + }, + { + "epoch": 3.315543695797198, + "loss": 0.5132, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "grad_norm": 3.6616158485412598, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "learning_rate": 0.00019799847702702377, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "loss": 0.4875798523426056, + "step": 9940 + }, + { + "ce_loss": 0.1997719556093216, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "distill_loss": 0.15560133755207062, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "ref_ce_loss": 0.09026752412319183, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "loss": 0.3376682698726654, + "step": 9940 + }, + { + "ce_loss": 0.0443369522690773, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "distill_loss": 0.10496130585670471, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "ref_ce_loss": 0.08619407564401627, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "loss": 0.4448798894882202, + "step": 9940 + }, + { + "ce_loss": 0.154523566365242, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "distill_loss": 0.15318924188613892, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "ref_ce_loss": 0.13680867850780487, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "loss": 0.5253775119781494, + "step": 9940 + }, + { + "ce_loss": 0.2364787459373474, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "distill_loss": 0.1384231299161911, + "epoch": 3.315543695797198, + "step": 9940 + }, + { + "epoch": 3.315543695797198, + "ref_ce_loss": 0.1246381625533104, + "step": 9940 + }, + { + "epoch": 3.3188792528352233, + "loss": 0.4866, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "grad_norm": 7.7432332038879395, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "learning_rate": 0.00019780652237851414, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "loss": 0.40983110666275024, + "step": 9950 + }, + { + "ce_loss": 0.12707103788852692, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "distill_loss": 0.13503378629684448, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "ref_ce_loss": 0.11911485344171524, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "loss": 0.5177261233329773, + "step": 9950 + }, + { + "ce_loss": 0.16839143633842468, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "distill_loss": 0.1610061228275299, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "ref_ce_loss": 0.1314130425453186, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "loss": 0.6280122995376587, + "step": 9950 + }, + { + "ce_loss": 0.15752331912517548, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "distill_loss": 0.15175409615039825, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "ref_ce_loss": 0.1513548046350479, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "loss": 0.37333211302757263, + "step": 9950 + }, + { + "ce_loss": 0.14288701117038727, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "distill_loss": 0.11070699244737625, + "epoch": 3.3188792528352233, + "step": 9950 + }, + { + "epoch": 3.3188792528352233, + "ref_ce_loss": 0.11954483389854431, + "step": 9950 + }, + { + "epoch": 3.3222148098732487, + "loss": 0.4921, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "grad_norm": 2.99534010887146, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "learning_rate": 0.00019761448054976573, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "loss": 0.74758380651474, + "step": 9960 + }, + { + "ce_loss": 0.13936404883861542, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "distill_loss": 0.10002149641513824, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "ref_ce_loss": 0.11220888048410416, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "loss": 1.3047645092010498, + "step": 9960 + }, + { + "ce_loss": 0.1143212616443634, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "distill_loss": 0.12109558284282684, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "ref_ce_loss": 0.1020015999674797, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "loss": 0.42227429151535034, + "step": 9960 + }, + { + "ce_loss": 0.0887669026851654, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "distill_loss": 0.14309155941009521, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "ref_ce_loss": 0.11570673435926437, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "loss": 0.349815309047699, + "step": 9960 + }, + { + "ce_loss": 0.08138938993215561, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "distill_loss": 0.11505304276943207, + "epoch": 3.3222148098732487, + "step": 9960 + }, + { + "epoch": 3.3222148098732487, + "ref_ce_loss": 0.11736801266670227, + "step": 9960 + }, + { + "epoch": 3.325550366911274, + "loss": 0.5193, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "grad_norm": 6.900662422180176, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "learning_rate": 0.0001974223518909873, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "loss": 0.652093231678009, + "step": 9970 + }, + { + "ce_loss": 0.28630322217941284, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "distill_loss": 0.23069880902767181, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "ref_ce_loss": 0.13491539657115936, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "loss": 0.5203249454498291, + "step": 9970 + }, + { + "ce_loss": 0.0266736950725317, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "distill_loss": 0.08863398432731628, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "ref_ce_loss": 0.10284887999296188, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "loss": 0.3818473815917969, + "step": 9970 + }, + { + "ce_loss": 0.12291661649942398, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "distill_loss": 0.11008165031671524, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "ref_ce_loss": 0.12098938971757889, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "loss": 0.29974690079689026, + "step": 9970 + }, + { + "ce_loss": 0.1182885468006134, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "distill_loss": 0.10031406581401825, + "epoch": 3.325550366911274, + "step": 9970 + }, + { + "epoch": 3.325550366911274, + "ref_ce_loss": 0.0809178575873375, + "step": 9970 + }, + { + "epoch": 3.3288859239492994, + "loss": 0.4931, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "grad_norm": 3.05298113822937, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "learning_rate": 0.00019723013675254557, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "loss": 0.33277106285095215, + "step": 9980 + }, + { + "ce_loss": 0.09598188102245331, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "distill_loss": 0.10369281470775604, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "ref_ce_loss": 0.10251549631357193, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "loss": 0.3862980604171753, + "step": 9980 + }, + { + "ce_loss": 0.15732648968696594, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "distill_loss": 0.09723378717899323, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "ref_ce_loss": 0.13168202340602875, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "loss": 0.3247963786125183, + "step": 9980 + }, + { + "ce_loss": 0.13272802531719208, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "distill_loss": 0.11012163758277893, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "ref_ce_loss": 0.0818270593881607, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "loss": 0.3600651025772095, + "step": 9980 + }, + { + "ce_loss": 0.12932313978672028, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "distill_loss": 0.1150616928935051, + "epoch": 3.3288859239492994, + "step": 9980 + }, + { + "epoch": 3.3288859239492994, + "ref_ce_loss": 0.08246026933193207, + "step": 9980 + }, + { + "epoch": 3.3322214809873247, + "loss": 0.464, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "grad_norm": 3.1215741634368896, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "learning_rate": 0.00019703783548496515, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "loss": 0.41121190786361694, + "step": 9990 + }, + { + "ce_loss": 0.1616680771112442, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "distill_loss": 0.12587764859199524, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "ref_ce_loss": 0.09707213193178177, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "loss": 0.6120696067810059, + "step": 9990 + }, + { + "ce_loss": 0.1784650683403015, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "distill_loss": 0.11934734135866165, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "ref_ce_loss": 0.10499797761440277, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "loss": 0.6155219078063965, + "step": 9990 + }, + { + "ce_loss": 0.12345847487449646, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "distill_loss": 0.10840432345867157, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "ref_ce_loss": 0.13200382888317108, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "loss": 0.3213757276535034, + "step": 9990 + }, + { + "ce_loss": 0.049462899565696716, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "distill_loss": 0.06645163893699646, + "epoch": 3.3322214809873247, + "step": 9990 + }, + { + "epoch": 3.3322214809873247, + "ref_ce_loss": 0.07780586183071136, + "step": 9990 + }, + { + "epoch": 3.33555703802535, + "loss": 0.4702, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "grad_norm": 2.5109341144561768, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "learning_rate": 0.00019684544843892772, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "loss": 0.384892076253891, + "step": 10000 + }, + { + "ce_loss": 0.12277370691299438, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "distill_loss": 0.11243809014558792, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "ref_ce_loss": 0.10022447258234024, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "loss": 0.31973639130592346, + "step": 10000 + }, + { + "ce_loss": 0.11735039204359055, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "distill_loss": 0.12508799135684967, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "ref_ce_loss": 0.07725123316049576, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "loss": 0.30221205949783325, + "step": 10000 + }, + { + "ce_loss": 0.09164729714393616, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "distill_loss": 0.07754302769899368, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "ref_ce_loss": 0.10495035350322723, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "loss": 0.5154871940612793, + "step": 10000 + }, + { + "ce_loss": 0.1592252552509308, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "distill_loss": 0.12933585047721863, + "epoch": 3.33555703802535, + "step": 10000 + }, + { + "epoch": 3.33555703802535, + "ref_ce_loss": 0.13211289048194885, + "step": 10000 + }, + { + "epoch": 3.3388925950633754, + "loss": 0.5013, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "grad_norm": 2.0777783393859863, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "learning_rate": 0.0001966529759652714, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "loss": 1.3123657703399658, + "step": 10010 + }, + { + "ce_loss": 0.20345760881900787, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "distill_loss": 0.13194897770881653, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "ref_ce_loss": 0.09893735498189926, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "loss": 0.5096614360809326, + "step": 10010 + }, + { + "ce_loss": 0.13056215643882751, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "distill_loss": 0.09500642120838165, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "ref_ce_loss": 0.11117087304592133, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "loss": 0.45446115732192993, + "step": 10010 + }, + { + "ce_loss": 0.1598779857158661, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "distill_loss": 0.10420762747526169, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "ref_ce_loss": 0.11016450077295303, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "loss": 0.5303270816802979, + "step": 10010 + }, + { + "ce_loss": 0.14446771144866943, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "distill_loss": 0.1430281698703766, + "epoch": 3.3388925950633754, + "step": 10010 + }, + { + "epoch": 3.3388925950633754, + "ref_ce_loss": 0.13635635375976562, + "step": 10010 + }, + { + "epoch": 3.342228152101401, + "loss": 0.521, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "grad_norm": 3.594378709793091, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "learning_rate": 0.00019646041841499, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "loss": 0.538306474685669, + "step": 10020 + }, + { + "ce_loss": 0.11759919673204422, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "distill_loss": 0.08925510942935944, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "ref_ce_loss": 0.10188145190477371, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "loss": 0.37550702691078186, + "step": 10020 + }, + { + "ce_loss": 0.10454921424388885, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "distill_loss": 0.0822499692440033, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "ref_ce_loss": 0.13304555416107178, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "loss": 0.5278544425964355, + "step": 10020 + }, + { + "ce_loss": 0.08236437290906906, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "distill_loss": 0.09529929608106613, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "ref_ce_loss": 0.07882020622491837, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "loss": 0.43094292283058167, + "step": 10020 + }, + { + "ce_loss": 0.10031628608703613, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "distill_loss": 0.08027870208024979, + "epoch": 3.342228152101401, + "step": 10020 + }, + { + "epoch": 3.342228152101401, + "ref_ce_loss": 0.07536637783050537, + "step": 10020 + }, + { + "epoch": 3.345563709139426, + "loss": 0.4727, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "grad_norm": 2.131840944290161, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "learning_rate": 0.00019626777613923255, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "loss": 0.2794293761253357, + "step": 10030 + }, + { + "ce_loss": 0.0968615934252739, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "distill_loss": 0.09702742844820023, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "ref_ce_loss": 0.06158556044101715, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "loss": 0.5944832563400269, + "step": 10030 + }, + { + "ce_loss": 0.2411114126443863, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "distill_loss": 0.15547247231006622, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "ref_ce_loss": 0.1431601643562317, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "loss": 0.36009153723716736, + "step": 10030 + }, + { + "ce_loss": 0.11186462640762329, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "distill_loss": 0.11474623531103134, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "ref_ce_loss": 0.1330903023481369, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "loss": 0.4779060482978821, + "step": 10030 + }, + { + "ce_loss": 0.13936035335063934, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "distill_loss": 0.13349001109600067, + "epoch": 3.345563709139426, + "step": 10030 + }, + { + "epoch": 3.345563709139426, + "ref_ce_loss": 0.1413579285144806, + "step": 10030 + }, + { + "epoch": 3.3488992661774515, + "loss": 0.4775, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "grad_norm": 2.7397146224975586, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "learning_rate": 0.00019607504948930253, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "loss": 0.500059187412262, + "step": 10040 + }, + { + "ce_loss": 0.17873555421829224, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "distill_loss": 0.1275375485420227, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "ref_ce_loss": 0.1627815067768097, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "loss": 0.3630131781101227, + "step": 10040 + }, + { + "ce_loss": 0.09446496516466141, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "distill_loss": 0.15339601039886475, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "ref_ce_loss": 0.11503797024488449, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "loss": 0.3327060639858246, + "step": 10040 + }, + { + "ce_loss": 0.07894602417945862, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "distill_loss": 0.09880810230970383, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "ref_ce_loss": 0.0752006471157074, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "loss": 0.4440487027168274, + "step": 10040 + }, + { + "ce_loss": 0.10263817757368088, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "distill_loss": 0.11173877120018005, + "epoch": 3.3488992661774515, + "step": 10040 + }, + { + "epoch": 3.3488992661774515, + "ref_ce_loss": 0.08284956961870193, + "step": 10040 + }, + { + "epoch": 3.352234823215477, + "loss": 0.4764, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "grad_norm": 3.3203084468841553, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "learning_rate": 0.0001958822388166574, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "loss": 0.4372074007987976, + "step": 10050 + }, + { + "ce_loss": 0.14007893204689026, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "distill_loss": 0.11357307434082031, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "ref_ce_loss": 0.09972081333398819, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "loss": 0.6508349776268005, + "step": 10050 + }, + { + "ce_loss": 0.1959071010351181, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "distill_loss": 0.15428954362869263, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "ref_ce_loss": 0.1650163233280182, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "loss": 0.3119848966598511, + "step": 10050 + }, + { + "ce_loss": 0.12638384103775024, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "distill_loss": 0.08100803196430206, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "ref_ce_loss": 0.07511499524116516, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "loss": 0.5597946643829346, + "step": 10050 + }, + { + "ce_loss": 0.18874359130859375, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "distill_loss": 0.12405283004045486, + "epoch": 3.352234823215477, + "step": 10050 + }, + { + "epoch": 3.352234823215477, + "ref_ce_loss": 0.1353001892566681, + "step": 10050 + }, + { + "epoch": 3.355570380253502, + "loss": 0.4504, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "grad_norm": 3.4764153957366943, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "learning_rate": 0.00019568934447290775, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "loss": 0.31696444749832153, + "step": 10060 + }, + { + "ce_loss": 0.13871832191944122, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "distill_loss": 0.10438908636569977, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "ref_ce_loss": 0.07383356243371964, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "loss": 0.6949265599250793, + "step": 10060 + }, + { + "ce_loss": 0.15461240708827972, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "distill_loss": 0.11820879578590393, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "ref_ce_loss": 0.12217464298009872, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "loss": 0.232447549700737, + "step": 10060 + }, + { + "ce_loss": 0.08912397176027298, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "distill_loss": 0.08207815885543823, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "ref_ce_loss": 0.06119540333747864, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "loss": 0.5104449987411499, + "step": 10060 + }, + { + "ce_loss": 0.1363963633775711, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "distill_loss": 0.1245986744761467, + "epoch": 3.355570380253502, + "step": 10060 + }, + { + "epoch": 3.355570380253502, + "ref_ce_loss": 0.10693434625864029, + "step": 10060 + }, + { + "epoch": 3.3589059372915275, + "loss": 0.5335, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "grad_norm": 3.6948611736297607, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "learning_rate": 0.00019549636680981673, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "loss": 0.38189783692359924, + "step": 10070 + }, + { + "ce_loss": 0.10184728354215622, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "distill_loss": 0.137140691280365, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "ref_ce_loss": 0.10864861309528351, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "loss": 0.4603230953216553, + "step": 10070 + }, + { + "ce_loss": 0.11181650310754776, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "distill_loss": 0.13628484308719635, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "ref_ce_loss": 0.11302220821380615, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "loss": 0.4682135581970215, + "step": 10070 + }, + { + "ce_loss": 0.15053050220012665, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "distill_loss": 0.16319218277931213, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "ref_ce_loss": 0.07172270119190216, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "loss": 0.20223474502563477, + "step": 10070 + }, + { + "ce_loss": 0.047858357429504395, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "distill_loss": 0.0959976464509964, + "epoch": 3.3589059372915275, + "step": 10070 + }, + { + "epoch": 3.3589059372915275, + "ref_ce_loss": 0.058139316737651825, + "step": 10070 + }, + { + "epoch": 3.362241494329553, + "loss": 0.4468, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "grad_norm": 13.602625846862793, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "learning_rate": 0.00019530330617929952, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "loss": 0.28368741273880005, + "step": 10080 + }, + { + "ce_loss": 0.06638386845588684, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "distill_loss": 0.10312242805957794, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "ref_ce_loss": 0.07346618920564651, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "loss": 0.6166037321090698, + "step": 10080 + }, + { + "ce_loss": 0.1991272121667862, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "distill_loss": 0.11051832139492035, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "ref_ce_loss": 0.11945595592260361, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "loss": 0.4137451946735382, + "step": 10080 + }, + { + "ce_loss": 0.16128048300743103, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "distill_loss": 0.14707604050636292, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "ref_ce_loss": 0.08171996474266052, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "loss": 0.2727615237236023, + "step": 10080 + }, + { + "ce_loss": 0.049319345504045486, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "distill_loss": 0.10266780853271484, + "epoch": 3.362241494329553, + "step": 10080 + }, + { + "epoch": 3.362241494329553, + "ref_ce_loss": 0.07126244157552719, + "step": 10080 + }, + { + "epoch": 3.3655770513675782, + "loss": 0.4974, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "grad_norm": 6.224396228790283, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "learning_rate": 0.0001951101629334225, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "loss": 0.5048408508300781, + "step": 10090 + }, + { + "ce_loss": 0.11753513664007187, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "distill_loss": 0.14925657212734222, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "ref_ce_loss": 0.09866927564144135, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "loss": 0.4639674425125122, + "step": 10090 + }, + { + "ce_loss": 0.12178202718496323, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "distill_loss": 0.12596401572227478, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "ref_ce_loss": 0.08418355882167816, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "loss": 0.47464287281036377, + "step": 10090 + }, + { + "ce_loss": 0.16472110152244568, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "distill_loss": 0.13395604491233826, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "ref_ce_loss": 0.11161396652460098, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "loss": 0.485629677772522, + "step": 10090 + }, + { + "ce_loss": 0.09048180282115936, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "distill_loss": 0.10109473764896393, + "epoch": 3.3655770513675782, + "step": 10090 + }, + { + "epoch": 3.3655770513675782, + "ref_ce_loss": 0.11090537160634995, + "step": 10090 + }, + { + "epoch": 3.3689126084056036, + "loss": 0.4875, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "grad_norm": 2.4241278171539307, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "learning_rate": 0.0001949169374244028, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "loss": 1.1379024982452393, + "step": 10100 + }, + { + "ce_loss": 0.20646634697914124, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "distill_loss": 0.1389389932155609, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "ref_ce_loss": 0.14189954102039337, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "loss": 0.3208090364933014, + "step": 10100 + }, + { + "ce_loss": 0.08959181606769562, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "distill_loss": 0.12985199689865112, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "ref_ce_loss": 0.06652584671974182, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "loss": 0.4916841983795166, + "step": 10100 + }, + { + "ce_loss": 0.12754502892494202, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "distill_loss": 0.13923802971839905, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "ref_ce_loss": 0.12598656117916107, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "loss": 0.2617308497428894, + "step": 10100 + }, + { + "ce_loss": 0.06644704192876816, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "distill_loss": 0.09629429876804352, + "epoch": 3.3689126084056036, + "step": 10100 + }, + { + "epoch": 3.3689126084056036, + "ref_ce_loss": 0.06436831504106522, + "step": 10100 + }, + { + "epoch": 3.372248165443629, + "loss": 0.5084, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "grad_norm": 2.8610148429870605, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "learning_rate": 0.00019472363000460756, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "loss": 0.46414175629615784, + "step": 10110 + }, + { + "ce_loss": 0.1390947699546814, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "distill_loss": 0.12784691154956818, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "ref_ce_loss": 0.10234922170639038, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "loss": 0.25928565859794617, + "step": 10110 + }, + { + "ce_loss": 0.08007047325372696, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "distill_loss": 0.10526285320520401, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "ref_ce_loss": 0.07375837862491608, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "loss": 0.30820897221565247, + "step": 10110 + }, + { + "ce_loss": 0.12451837211847305, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "distill_loss": 0.09478746354579926, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "ref_ce_loss": 0.06415072828531265, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "loss": 0.3212338984012604, + "step": 10110 + }, + { + "ce_loss": 0.09792843461036682, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "distill_loss": 0.09361623972654343, + "epoch": 3.372248165443629, + "step": 10110 + }, + { + "epoch": 3.372248165443629, + "ref_ce_loss": 0.06124284863471985, + "step": 10110 + }, + { + "epoch": 3.3755837224816543, + "loss": 0.5001, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "grad_norm": 2.521216869354248, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "learning_rate": 0.00019453024102655326, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "loss": 0.3180510103702545, + "step": 10120 + }, + { + "ce_loss": 0.10240530967712402, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "distill_loss": 0.11712464690208435, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "ref_ce_loss": 0.0984673723578453, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "loss": 0.6909222602844238, + "step": 10120 + }, + { + "ce_loss": 0.1457805633544922, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "distill_loss": 0.1715020090341568, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "ref_ce_loss": 0.10833510011434555, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "loss": 0.4989069402217865, + "step": 10120 + }, + { + "ce_loss": 0.2270646095275879, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "distill_loss": 0.18987296521663666, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "ref_ce_loss": 0.08195262402296066, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "loss": 0.38403308391571045, + "step": 10120 + }, + { + "ce_loss": 0.11043936759233475, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "distill_loss": 0.12119657546281815, + "epoch": 3.3755837224816543, + "step": 10120 + }, + { + "epoch": 3.3755837224816543, + "ref_ce_loss": 0.10974656790494919, + "step": 10120 + }, + { + "epoch": 3.3789192795196796, + "loss": 0.4831, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "grad_norm": 3.5986907482147217, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "learning_rate": 0.00019433677084290497, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "loss": 0.30134570598602295, + "step": 10130 + }, + { + "ce_loss": 0.07178352773189545, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "distill_loss": 0.0984201729297638, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "ref_ce_loss": 0.0872310921549797, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "loss": 0.7953840494155884, + "step": 10130 + }, + { + "ce_loss": 0.1972198635339737, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "distill_loss": 0.13514646887779236, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "ref_ce_loss": 0.15314488112926483, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "loss": 0.4279595613479614, + "step": 10130 + }, + { + "ce_loss": 0.1257810741662979, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "distill_loss": 0.1260966956615448, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "ref_ce_loss": 0.10996173322200775, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "loss": 0.23371756076812744, + "step": 10130 + }, + { + "ce_loss": 0.037545885890722275, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "distill_loss": 0.11940938234329224, + "epoch": 3.3789192795196796, + "step": 10130 + }, + { + "epoch": 3.3789192795196796, + "ref_ce_loss": 0.07670680433511734, + "step": 10130 + }, + { + "epoch": 3.382254836557705, + "loss": 0.517, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "grad_norm": 3.4739842414855957, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "learning_rate": 0.00019414321980647616, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "loss": 0.2659488022327423, + "step": 10140 + }, + { + "ce_loss": 0.05423908680677414, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "distill_loss": 0.11070837080478668, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "ref_ce_loss": 0.10095931589603424, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "loss": 0.4671019911766052, + "step": 10140 + }, + { + "ce_loss": 0.08684322983026505, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "distill_loss": 0.17755573987960815, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "ref_ce_loss": 0.10452690720558167, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "loss": 0.6300301551818848, + "step": 10140 + }, + { + "ce_loss": 0.1521824449300766, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "distill_loss": 0.16065070033073425, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "ref_ce_loss": 0.10154230147600174, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "loss": 0.33878257870674133, + "step": 10140 + }, + { + "ce_loss": 0.10511957108974457, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "distill_loss": 0.1427706480026245, + "epoch": 3.382254836557705, + "step": 10140 + }, + { + "epoch": 3.382254836557705, + "ref_ce_loss": 0.08972452580928802, + "step": 10140 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.4769, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "grad_norm": 2.4133687019348145, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "learning_rate": 0.0001939495882702275, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.3416212797164917, + "step": 10150 + }, + { + "ce_loss": 0.11643853038549423, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "distill_loss": 0.13698649406433105, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "ref_ce_loss": 0.0881284549832344, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.5176674127578735, + "step": 10150 + }, + { + "ce_loss": 0.18818378448486328, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "distill_loss": 0.12774789333343506, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "ref_ce_loss": 0.12273141741752625, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.4802161157131195, + "step": 10150 + }, + { + "ce_loss": 0.10690363496541977, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "distill_loss": 0.18461987376213074, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "ref_ce_loss": 0.12223734706640244, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "loss": 0.5595582723617554, + "step": 10150 + }, + { + "ce_loss": 0.20622768998146057, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "distill_loss": 0.2150389701128006, + "epoch": 3.3855903935957303, + "step": 10150 + }, + { + "epoch": 3.3855903935957303, + "ref_ce_loss": 0.09231989085674286, + "step": 10150 + }, + { + "epoch": 3.3889259506337557, + "loss": 0.4572, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "grad_norm": 2.9398603439331055, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "learning_rate": 0.0001937558765872665, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "loss": 0.5103137493133545, + "step": 10160 + }, + { + "ce_loss": 0.07832441478967667, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "distill_loss": 0.16051746904850006, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "ref_ce_loss": 0.06761564314365387, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "loss": 0.380056232213974, + "step": 10160 + }, + { + "ce_loss": 0.1041548103094101, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "distill_loss": 0.15851038694381714, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "ref_ce_loss": 0.11733871698379517, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "loss": 0.37477728724479675, + "step": 10160 + }, + { + "ce_loss": 0.10600591450929642, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "distill_loss": 0.12407410144805908, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "ref_ce_loss": 0.1121428906917572, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "loss": 0.6696457862854004, + "step": 10160 + }, + { + "ce_loss": 0.17717599868774414, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "distill_loss": 0.16447025537490845, + "epoch": 3.3889259506337557, + "step": 10160 + }, + { + "epoch": 3.3889259506337557, + "ref_ce_loss": 0.11270187050104141, + "step": 10160 + }, + { + "epoch": 3.392261507671781, + "loss": 0.4992, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "grad_norm": 2.9642333984375, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "learning_rate": 0.00019356208511084693, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "loss": 0.8481308221817017, + "step": 10170 + }, + { + "ce_loss": 0.23037905991077423, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "distill_loss": 0.21714502573013306, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "ref_ce_loss": 0.2115284502506256, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "loss": 0.387999951839447, + "step": 10170 + }, + { + "ce_loss": 0.14844535291194916, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "distill_loss": 0.11714182794094086, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "ref_ce_loss": 0.12236713618040085, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "loss": 0.5657041072845459, + "step": 10170 + }, + { + "ce_loss": 0.2327728122472763, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "distill_loss": 0.2262754589319229, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "ref_ce_loss": 0.10661839693784714, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "loss": 0.39357078075408936, + "step": 10170 + }, + { + "ce_loss": 0.1515912264585495, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "distill_loss": 0.15029920637607574, + "epoch": 3.392261507671781, + "step": 10170 + }, + { + "epoch": 3.392261507671781, + "ref_ce_loss": 0.09164946526288986, + "step": 10170 + }, + { + "epoch": 3.3955970647098064, + "loss": 0.532, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "grad_norm": 2.959740161895752, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "learning_rate": 0.00019336821419436794, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "loss": 0.45094069838523865, + "step": 10180 + }, + { + "ce_loss": 0.1460072249174118, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "distill_loss": 0.13633152842521667, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "ref_ce_loss": 0.09167934954166412, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "loss": 0.3513057827949524, + "step": 10180 + }, + { + "ce_loss": 0.08992922306060791, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "distill_loss": 0.12288524955511093, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "ref_ce_loss": 0.10806012898683548, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "loss": 0.5274137854576111, + "step": 10180 + }, + { + "ce_loss": 0.14708077907562256, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "distill_loss": 0.14461305737495422, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "ref_ce_loss": 0.1427834928035736, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "loss": 0.6027082204818726, + "step": 10180 + }, + { + "ce_loss": 0.15740935504436493, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "distill_loss": 0.18717409670352936, + "epoch": 3.3955970647098064, + "step": 10180 + }, + { + "epoch": 3.3955970647098064, + "ref_ce_loss": 0.13125848770141602, + "step": 10180 + }, + { + "epoch": 3.3989326217478317, + "loss": 0.5064, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "grad_norm": 2.5797903537750244, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "learning_rate": 0.0001931742641913736, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "loss": 0.4065500795841217, + "step": 10190 + }, + { + "ce_loss": 0.13067033886909485, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "distill_loss": 0.13068890571594238, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "ref_ce_loss": 0.12061259895563126, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "loss": 0.43447381258010864, + "step": 10190 + }, + { + "ce_loss": 0.12058955430984497, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "distill_loss": 0.19879361987113953, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "ref_ce_loss": 0.07918906956911087, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "loss": 0.7729294300079346, + "step": 10190 + }, + { + "ce_loss": 0.1260051280260086, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "distill_loss": 0.2032921016216278, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "ref_ce_loss": 0.11601860076189041, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "loss": 0.5093587040901184, + "step": 10190 + }, + { + "ce_loss": 0.2039278894662857, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "distill_loss": 0.18354101479053497, + "epoch": 3.3989326217478317, + "step": 10190 + }, + { + "epoch": 3.3989326217478317, + "ref_ce_loss": 0.12095209211111069, + "step": 10190 + }, + { + "epoch": 3.402268178785857, + "loss": 0.4849, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "grad_norm": 2.8363871574401855, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "learning_rate": 0.00019298023545555226, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "loss": 0.30198198556900024, + "step": 10200 + }, + { + "ce_loss": 0.10195101797580719, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "distill_loss": 0.09005577862262726, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "ref_ce_loss": 0.08784396946430206, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "loss": 0.4342412054538727, + "step": 10200 + }, + { + "ce_loss": 0.10846803337335587, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "distill_loss": 0.15420377254486084, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "ref_ce_loss": 0.12385132163763046, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "loss": 0.4970320761203766, + "step": 10200 + }, + { + "ce_loss": 0.10349898785352707, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "distill_loss": 0.11978067457675934, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "ref_ce_loss": 0.049799658358097076, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "loss": 0.30252933502197266, + "step": 10200 + }, + { + "ce_loss": 0.040033504366874695, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "distill_loss": 0.08907225728034973, + "epoch": 3.402268178785857, + "step": 10200 + }, + { + "epoch": 3.402268178785857, + "ref_ce_loss": 0.10964923352003098, + "step": 10200 + }, + { + "epoch": 3.4056037358238824, + "loss": 0.4874, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "grad_norm": 3.8480939865112305, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "learning_rate": 0.00019278612834073574, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "loss": 0.4816315770149231, + "step": 10210 + }, + { + "ce_loss": 0.21080242097377777, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "distill_loss": 0.12634778022766113, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "ref_ce_loss": 0.09341747313737869, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "loss": 0.6531007289886475, + "step": 10210 + }, + { + "ce_loss": 0.15230649709701538, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "distill_loss": 0.1511775106191635, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "ref_ce_loss": 0.1311090886592865, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "loss": 0.7131592631340027, + "step": 10210 + }, + { + "ce_loss": 0.19272080063819885, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "distill_loss": 0.170637309551239, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "ref_ce_loss": 0.1068262979388237, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "loss": 0.3750826418399811, + "step": 10210 + }, + { + "ce_loss": 0.06305833905935287, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "distill_loss": 0.09007889032363892, + "epoch": 3.4056037358238824, + "step": 10210 + }, + { + "epoch": 3.4056037358238824, + "ref_ce_loss": 0.05692875385284424, + "step": 10210 + }, + { + "epoch": 3.4089392928619078, + "loss": 0.4827, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "grad_norm": 4.855068206787109, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "learning_rate": 0.00019259194320089888, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "loss": 0.365900456905365, + "step": 10220 + }, + { + "ce_loss": 0.116310253739357, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "distill_loss": 0.11983893066644669, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "ref_ce_loss": 0.08262918144464493, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "loss": 0.45967572927474976, + "step": 10220 + }, + { + "ce_loss": 0.156527578830719, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "distill_loss": 0.13135460019111633, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "ref_ce_loss": 0.13319435715675354, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "loss": 0.4894421696662903, + "step": 10220 + }, + { + "ce_loss": 0.138182133436203, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "distill_loss": 0.12863339483737946, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "ref_ce_loss": 0.14758390188217163, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "loss": 0.3858674168586731, + "step": 10220 + }, + { + "ce_loss": 0.0867292508482933, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "distill_loss": 0.13426059484481812, + "epoch": 3.4089392928619078, + "step": 10220 + }, + { + "epoch": 3.4089392928619078, + "ref_ce_loss": 0.1308407485485077, + "step": 10220 + }, + { + "epoch": 3.412274849899933, + "loss": 0.4488, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "grad_norm": 2.2313504219055176, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "learning_rate": 0.00019239768039015884, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "loss": 0.4219973087310791, + "step": 10230 + }, + { + "ce_loss": 0.1498318910598755, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "distill_loss": 0.1641000658273697, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "ref_ce_loss": 0.10786410421133041, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "loss": 0.4946644902229309, + "step": 10230 + }, + { + "ce_loss": 0.17174388468265533, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "distill_loss": 0.15777334570884705, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "ref_ce_loss": 0.13005994260311127, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "loss": 0.7625753879547119, + "step": 10230 + }, + { + "ce_loss": 0.2250397950410843, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "distill_loss": 0.27483320236206055, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "ref_ce_loss": 0.11940769106149673, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "loss": 0.3382866680622101, + "step": 10230 + }, + { + "ce_loss": 0.10599769651889801, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "distill_loss": 0.1568976789712906, + "epoch": 3.412274849899933, + "step": 10230 + }, + { + "epoch": 3.412274849899933, + "ref_ce_loss": 0.07457992434501648, + "step": 10230 + }, + { + "epoch": 3.4156104069379585, + "loss": 0.5524, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "grad_norm": 5.488867282867432, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "learning_rate": 0.0001922033402627742, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "loss": 0.5070091485977173, + "step": 10240 + }, + { + "ce_loss": 0.14857244491577148, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "distill_loss": 0.17652364075183868, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "ref_ce_loss": 0.07085951417684555, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "loss": 0.5407269597053528, + "step": 10240 + }, + { + "ce_loss": 0.24016886949539185, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "distill_loss": 0.17092876136302948, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "ref_ce_loss": 0.12917813658714294, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "loss": 0.35367733240127563, + "step": 10240 + }, + { + "ce_loss": 0.10315673053264618, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "distill_loss": 0.11190631985664368, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "ref_ce_loss": 0.09125013649463654, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "loss": 0.39612677693367004, + "step": 10240 + }, + { + "ce_loss": 0.1108587235212326, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "distill_loss": 0.09831427782773972, + "epoch": 3.4156104069379585, + "step": 10240 + }, + { + "epoch": 3.4156104069379585, + "ref_ce_loss": 0.0691721960902214, + "step": 10240 + }, + { + "epoch": 3.418945963975984, + "loss": 0.4671, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "grad_norm": 3.53739595413208, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "learning_rate": 0.00019200892317314486, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "loss": 0.4813249707221985, + "step": 10250 + }, + { + "ce_loss": 0.10285835713148117, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "distill_loss": 0.11528073996305466, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "ref_ce_loss": 0.10409308969974518, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "loss": 0.943084180355072, + "step": 10250 + }, + { + "ce_loss": 0.09678593277931213, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "distill_loss": 0.13432377576828003, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "ref_ce_loss": 0.07842497527599335, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "loss": 0.5886873602867126, + "step": 10250 + }, + { + "ce_loss": 0.1650712937116623, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "distill_loss": 0.170527845621109, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "ref_ce_loss": 0.15662546455860138, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "loss": 0.46970099210739136, + "step": 10250 + }, + { + "ce_loss": 0.13701611757278442, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "distill_loss": 0.130446657538414, + "epoch": 3.418945963975984, + "step": 10250 + }, + { + "epoch": 3.418945963975984, + "ref_ce_loss": 0.07624396681785583, + "step": 10250 + }, + { + "epoch": 3.422281521014009, + "loss": 0.5555, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "grad_norm": 2.5280873775482178, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "learning_rate": 0.00019181442947581074, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "loss": 0.29995018243789673, + "step": 10260 + }, + { + "ce_loss": 0.07752680778503418, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "distill_loss": 0.11020202189683914, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "ref_ce_loss": 0.1116335466504097, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "loss": 0.38425710797309875, + "step": 10260 + }, + { + "ce_loss": 0.10159788280725479, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "distill_loss": 0.20541100203990936, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "ref_ce_loss": 0.07702256739139557, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "loss": 0.39652562141418457, + "step": 10260 + }, + { + "ce_loss": 0.07538190484046936, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "distill_loss": 0.08564054220914841, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "ref_ce_loss": 0.07945127040147781, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "loss": 0.24914473295211792, + "step": 10260 + }, + { + "ce_loss": 0.04793926328420639, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "distill_loss": 0.07164011895656586, + "epoch": 3.422281521014009, + "step": 10260 + }, + { + "epoch": 3.422281521014009, + "ref_ce_loss": 0.07984654605388641, + "step": 10260 + }, + { + "epoch": 3.4256170780520345, + "loss": 0.4457, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "grad_norm": 5.044663429260254, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "learning_rate": 0.00019161985952545173, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "loss": 0.5661646127700806, + "step": 10270 + }, + { + "ce_loss": 0.18041296303272247, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "distill_loss": 0.15173012018203735, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "ref_ce_loss": 0.13187175989151, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "loss": 0.5040100812911987, + "step": 10270 + }, + { + "ce_loss": 0.16350196301937103, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "distill_loss": 0.17784355580806732, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "ref_ce_loss": 0.1626138836145401, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "loss": 0.44560155272483826, + "step": 10270 + }, + { + "ce_loss": 0.11676087230443954, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "distill_loss": 0.11542589217424393, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "ref_ce_loss": 0.09448987245559692, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "loss": 0.3565663695335388, + "step": 10270 + }, + { + "ce_loss": 0.06954836845397949, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "distill_loss": 0.12725864350795746, + "epoch": 3.4256170780520345, + "step": 10270 + }, + { + "epoch": 3.4256170780520345, + "ref_ce_loss": 0.13181714713573456, + "step": 10270 + }, + { + "epoch": 3.42895263509006, + "loss": 0.4707, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "grad_norm": 2.253469228744507, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "learning_rate": 0.0001914252136768867, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "loss": 0.3005276322364807, + "step": 10280 + }, + { + "ce_loss": 0.05415527522563934, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "distill_loss": 0.10406085848808289, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "ref_ce_loss": 0.07171543687582016, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "loss": 0.30626076459884644, + "step": 10280 + }, + { + "ce_loss": 0.10488387942314148, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "distill_loss": 0.11459501087665558, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "ref_ce_loss": 0.08659002184867859, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "loss": 0.6513745188713074, + "step": 10280 + }, + { + "ce_loss": 0.19871819019317627, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "distill_loss": 0.18216168880462646, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "ref_ce_loss": 0.11713993549346924, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "loss": 0.5560740232467651, + "step": 10280 + }, + { + "ce_loss": 0.13708558678627014, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "distill_loss": 0.17769046127796173, + "epoch": 3.42895263509006, + "step": 10280 + }, + { + "epoch": 3.42895263509006, + "ref_ce_loss": 0.1358463168144226, + "step": 10280 + }, + { + "epoch": 3.4322881921280852, + "loss": 0.4525, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "grad_norm": 2.5623693466186523, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "learning_rate": 0.00019123049228507278, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "loss": 0.2555229961872101, + "step": 10290 + }, + { + "ce_loss": 0.08814626187086105, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "distill_loss": 0.09802016615867615, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "ref_ce_loss": 0.06921228021383286, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "loss": 0.4880223870277405, + "step": 10290 + }, + { + "ce_loss": 0.19454342126846313, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "distill_loss": 0.14570696651935577, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "ref_ce_loss": 0.12097126245498657, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "loss": 0.8403055667877197, + "step": 10290 + }, + { + "ce_loss": 0.15819591283798218, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "distill_loss": 0.18111252784729004, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "ref_ce_loss": 0.0896746814250946, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "loss": 0.6239392757415771, + "step": 10290 + }, + { + "ce_loss": 0.14710165560245514, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "distill_loss": 0.19059288501739502, + "epoch": 3.4322881921280852, + "step": 10290 + }, + { + "epoch": 3.4322881921280852, + "ref_ce_loss": 0.10249871760606766, + "step": 10290 + }, + { + "epoch": 3.4356237491661106, + "loss": 0.546, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "grad_norm": 4.703062057495117, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "learning_rate": 0.000191035695705105, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "loss": 0.304233580827713, + "step": 10300 + }, + { + "ce_loss": 0.08161269873380661, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "distill_loss": 0.12234167754650116, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "ref_ce_loss": 0.08137601613998413, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "loss": 0.3445499539375305, + "step": 10300 + }, + { + "ce_loss": 0.1015457957983017, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "distill_loss": 0.13137434422969818, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "ref_ce_loss": 0.08375562727451324, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "loss": 0.404718279838562, + "step": 10300 + }, + { + "ce_loss": 0.09954530745744705, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "distill_loss": 0.15256458520889282, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "ref_ce_loss": 0.12649698555469513, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "loss": 0.5050632953643799, + "step": 10300 + }, + { + "ce_loss": 0.07534976303577423, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "distill_loss": 0.12757937610149384, + "epoch": 3.4356237491661106, + "step": 10300 + }, + { + "epoch": 3.4356237491661106, + "ref_ce_loss": 0.10308182239532471, + "step": 10300 + }, + { + "epoch": 3.438959306204136, + "loss": 0.4814, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "grad_norm": 2.611424446105957, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "learning_rate": 0.00019084082429221558, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "loss": 0.4377182722091675, + "step": 10310 + }, + { + "ce_loss": 0.09518056362867355, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "distill_loss": 0.12689684331417084, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "ref_ce_loss": 0.16369374096393585, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "loss": 0.6643956899642944, + "step": 10310 + }, + { + "ce_loss": 0.16947884857654572, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "distill_loss": 0.1439935564994812, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "ref_ce_loss": 0.11696118861436844, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "loss": 0.5247431397438049, + "step": 10310 + }, + { + "ce_loss": 0.20082227885723114, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "distill_loss": 0.14931391179561615, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "ref_ce_loss": 0.12185055762529373, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "loss": 0.3367827534675598, + "step": 10310 + }, + { + "ce_loss": 0.09930837154388428, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "distill_loss": 0.1177770271897316, + "epoch": 3.438959306204136, + "step": 10310 + }, + { + "epoch": 3.438959306204136, + "ref_ce_loss": 0.08937904238700867, + "step": 10310 + }, + { + "epoch": 3.4422948632421613, + "loss": 0.5355, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "grad_norm": 2.856062889099121, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "learning_rate": 0.00019064587840177306, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "loss": 0.3735664486885071, + "step": 10320 + }, + { + "ce_loss": 0.1009257510304451, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "distill_loss": 0.11772341281175613, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "ref_ce_loss": 0.08748678863048553, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "loss": 0.4337637424468994, + "step": 10320 + }, + { + "ce_loss": 0.08262146264314651, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "distill_loss": 0.21009844541549683, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "ref_ce_loss": 0.10304173082113266, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "loss": 0.3304564654827118, + "step": 10320 + }, + { + "ce_loss": 0.11042464524507523, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "distill_loss": 0.10334540903568268, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "ref_ce_loss": 0.08047734946012497, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "loss": 0.45189177989959717, + "step": 10320 + }, + { + "ce_loss": 0.1377006471157074, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "distill_loss": 0.1019442155957222, + "epoch": 3.4422948632421613, + "step": 10320 + }, + { + "epoch": 3.4422948632421613, + "ref_ce_loss": 0.06532624363899231, + "step": 10320 + }, + { + "epoch": 3.4456304202801866, + "loss": 0.4176, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "grad_norm": 4.124449729919434, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "learning_rate": 0.00019045085838928174, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "loss": 0.43326324224472046, + "step": 10330 + }, + { + "ce_loss": 0.14759215712547302, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "distill_loss": 0.1370130479335785, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "ref_ce_loss": 0.10759326070547104, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "loss": 0.392121285200119, + "step": 10330 + }, + { + "ce_loss": 0.1546168476343155, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "distill_loss": 0.13393008708953857, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "ref_ce_loss": 0.1034577339887619, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "loss": 0.42810681462287903, + "step": 10330 + }, + { + "ce_loss": 0.18438415229320526, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "distill_loss": 0.14138942956924438, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "ref_ce_loss": 0.10176559537649155, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "loss": 0.42089158296585083, + "step": 10330 + }, + { + "ce_loss": 0.11071355640888214, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "distill_loss": 0.10371338576078415, + "epoch": 3.4456304202801866, + "step": 10330 + }, + { + "epoch": 3.4456304202801866, + "ref_ce_loss": 0.12404021620750427, + "step": 10330 + }, + { + "epoch": 3.448965977318212, + "loss": 0.4524, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "grad_norm": 3.522857189178467, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "learning_rate": 0.00019025576461038134, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "loss": 0.5325416922569275, + "step": 10340 + }, + { + "ce_loss": 0.231018528342247, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "distill_loss": 0.11983978748321533, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "ref_ce_loss": 0.11137314885854721, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "loss": 0.3700246214866638, + "step": 10340 + }, + { + "ce_loss": 0.12125429511070251, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "distill_loss": 0.12481550872325897, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "ref_ce_loss": 0.08128783106803894, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "loss": 0.3559412360191345, + "step": 10340 + }, + { + "ce_loss": 0.10249993205070496, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "distill_loss": 0.09033988416194916, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "ref_ce_loss": 0.0899801105260849, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "loss": 0.5967211127281189, + "step": 10340 + }, + { + "ce_loss": 0.1554853916168213, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "distill_loss": 0.1830393224954605, + "epoch": 3.448965977318212, + "step": 10340 + }, + { + "epoch": 3.448965977318212, + "ref_ce_loss": 0.12638594210147858, + "step": 10340 + }, + { + "epoch": 3.4523015343562373, + "loss": 0.4761, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "grad_norm": 2.2772934436798096, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "learning_rate": 0.0001900605974208459, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "loss": 0.28337162733078003, + "step": 10350 + }, + { + "ce_loss": 0.05180773138999939, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "distill_loss": 0.0789615586400032, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "ref_ce_loss": 0.06448401510715485, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "loss": 0.2962020933628082, + "step": 10350 + }, + { + "ce_loss": 0.1171887069940567, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "distill_loss": 0.08088953793048859, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "ref_ce_loss": 0.09784910082817078, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "loss": 0.7583683133125305, + "step": 10350 + }, + { + "ce_loss": 0.1913074254989624, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "distill_loss": 0.17555482685565948, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "ref_ce_loss": 0.1408015340566635, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "loss": 0.487335205078125, + "step": 10350 + }, + { + "ce_loss": 0.1990186721086502, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "distill_loss": 0.11871914565563202, + "epoch": 3.4523015343562373, + "step": 10350 + }, + { + "epoch": 3.4523015343562373, + "ref_ce_loss": 0.11281263083219528, + "step": 10350 + }, + { + "epoch": 3.4556370913942627, + "loss": 0.4634, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "grad_norm": 2.651352882385254, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "learning_rate": 0.00018986535717658334, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "loss": 0.6339795589447021, + "step": 10360 + }, + { + "ce_loss": 0.20450928807258606, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "distill_loss": 0.15441319346427917, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "ref_ce_loss": 0.14329175651073456, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "loss": 0.8533214926719666, + "step": 10360 + }, + { + "ce_loss": 0.07007604837417603, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "distill_loss": 0.13101819157600403, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "ref_ce_loss": 0.10507778078317642, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "loss": 0.41858166456222534, + "step": 10360 + }, + { + "ce_loss": 0.0904201790690422, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "distill_loss": 0.11084774136543274, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "ref_ce_loss": 0.14289547502994537, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "loss": 0.4427366554737091, + "step": 10360 + }, + { + "ce_loss": 0.15832187235355377, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "distill_loss": 0.1405555158853531, + "epoch": 3.4556370913942627, + "step": 10360 + }, + { + "epoch": 3.4556370913942627, + "ref_ce_loss": 0.08129150420427322, + "step": 10360 + }, + { + "epoch": 3.458972648432288, + "loss": 0.5116, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "grad_norm": 1.8335307836532593, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "learning_rate": 0.0001896700442336349, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "loss": 1.324198603630066, + "step": 10370 + }, + { + "ce_loss": 0.11334358155727386, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "distill_loss": 0.12460387498140335, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "ref_ce_loss": 0.06593289226293564, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "loss": 0.47392234206199646, + "step": 10370 + }, + { + "ce_loss": 0.16286267340183258, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "distill_loss": 0.14801934361457825, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "ref_ce_loss": 0.1139780580997467, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "loss": 0.513584554195404, + "step": 10370 + }, + { + "ce_loss": 0.15807706117630005, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "distill_loss": 0.19912990927696228, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "ref_ce_loss": 0.11430535465478897, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "loss": 0.4971799850463867, + "step": 10370 + }, + { + "ce_loss": 0.14749285578727722, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "distill_loss": 0.16247427463531494, + "epoch": 3.458972648432288, + "step": 10370 + }, + { + "epoch": 3.458972648432288, + "ref_ce_loss": 0.1421637386083603, + "step": 10370 + }, + { + "epoch": 3.4623082054703134, + "loss": 0.544, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "grad_norm": 2.416501522064209, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "learning_rate": 0.00018947465894817434, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "loss": 0.8389631509780884, + "step": 10380 + }, + { + "ce_loss": 0.19231632351875305, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "distill_loss": 0.19508925080299377, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "ref_ce_loss": 0.1727515161037445, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "loss": 0.4167601764202118, + "step": 10380 + }, + { + "ce_loss": 0.13529586791992188, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "distill_loss": 0.13505946099758148, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "ref_ce_loss": 0.1462845355272293, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "loss": 0.42783311009407043, + "step": 10380 + }, + { + "ce_loss": 0.1599048376083374, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "distill_loss": 0.12761659920215607, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "ref_ce_loss": 0.09368833154439926, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "loss": 0.6533681154251099, + "step": 10380 + }, + { + "ce_loss": 0.14638778567314148, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "distill_loss": 0.1574423760175705, + "epoch": 3.4623082054703134, + "step": 10380 + }, + { + "epoch": 3.4623082054703134, + "ref_ce_loss": 0.1208135262131691, + "step": 10380 + }, + { + "epoch": 3.4656437625083387, + "loss": 0.456, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "grad_norm": 2.474968671798706, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "learning_rate": 0.00018927920167650735, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "loss": 0.5869070887565613, + "step": 10390 + }, + { + "ce_loss": 0.2259010523557663, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "distill_loss": 0.17279842495918274, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "ref_ce_loss": 0.13575266301631927, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "loss": 0.6994563341140747, + "step": 10390 + }, + { + "ce_loss": 0.22502291202545166, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "distill_loss": 0.17224964499473572, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "ref_ce_loss": 0.11385179311037064, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "loss": 0.38169121742248535, + "step": 10390 + }, + { + "ce_loss": 0.07678114622831345, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "distill_loss": 0.0935371145606041, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "ref_ce_loss": 0.06890590488910675, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "loss": 0.7031863927841187, + "step": 10390 + }, + { + "ce_loss": 0.1995294988155365, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "distill_loss": 0.1283656358718872, + "epoch": 3.4656437625083387, + "step": 10390 + }, + { + "epoch": 3.4656437625083387, + "ref_ce_loss": 0.0902036800980568, + "step": 10390 + }, + { + "epoch": 3.468979319546364, + "loss": 0.4899, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "grad_norm": 2.958378314971924, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "learning_rate": 0.0001890836727750709, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "loss": 0.4847678542137146, + "step": 10400 + }, + { + "ce_loss": 0.23173287510871887, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "distill_loss": 0.1382112205028534, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "ref_ce_loss": 0.11151211708784103, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "loss": 0.440000057220459, + "step": 10400 + }, + { + "ce_loss": 0.1410650759935379, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "distill_loss": 0.11761602759361267, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "ref_ce_loss": 0.1356046199798584, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "loss": 0.6042724847793579, + "step": 10400 + }, + { + "ce_loss": 0.17960430681705475, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "distill_loss": 0.13856202363967896, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "ref_ce_loss": 0.07231870293617249, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "loss": 0.4100415110588074, + "step": 10400 + }, + { + "ce_loss": 0.15962854027748108, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "distill_loss": 0.12709757685661316, + "epoch": 3.468979319546364, + "step": 10400 + }, + { + "epoch": 3.468979319546364, + "ref_ce_loss": 0.0880059152841568, + "step": 10400 + }, + { + "epoch": 3.4723148765843894, + "loss": 0.4718, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "grad_norm": 2.205946683883667, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "learning_rate": 0.00018888807260043249, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "loss": 0.5133829116821289, + "step": 10410 + }, + { + "ce_loss": 0.13501222431659698, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "distill_loss": 0.11960557103157043, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "ref_ce_loss": 0.09122700989246368, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "loss": 0.3817310631275177, + "step": 10410 + }, + { + "ce_loss": 0.07214323431253433, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "distill_loss": 0.1570921242237091, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "ref_ce_loss": 0.11837377399206161, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "loss": 0.530503511428833, + "step": 10410 + }, + { + "ce_loss": 0.23058447241783142, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "distill_loss": 0.16666772961616516, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "ref_ce_loss": 0.10783181339502335, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "loss": 0.4435175955295563, + "step": 10410 + }, + { + "ce_loss": 0.11766248196363449, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "distill_loss": 0.1328033208847046, + "epoch": 3.4723148765843894, + "step": 10410 + }, + { + "epoch": 3.4723148765843894, + "ref_ce_loss": 0.11009076982736588, + "step": 10410 + }, + { + "epoch": 3.4756504336224148, + "loss": 0.5215, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "grad_norm": 5.0243682861328125, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "learning_rate": 0.0001886924015092898, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "loss": 0.8781328201293945, + "step": 10420 + }, + { + "ce_loss": 0.24173305928707123, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "distill_loss": 0.16323822736740112, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "ref_ce_loss": 0.14663997292518616, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "loss": 0.38078439235687256, + "step": 10420 + }, + { + "ce_loss": 0.1302793025970459, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "distill_loss": 0.1325978934764862, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "ref_ce_loss": 0.0761353150010109, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "loss": 0.5718013048171997, + "step": 10420 + }, + { + "ce_loss": 0.160188689827919, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "distill_loss": 0.13954801857471466, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "ref_ce_loss": 0.10986100137233734, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "loss": 0.44210585951805115, + "step": 10420 + }, + { + "ce_loss": 0.12488564103841782, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "distill_loss": 0.13288307189941406, + "epoch": 3.4756504336224148, + "step": 10420 + }, + { + "epoch": 3.4756504336224148, + "ref_ce_loss": 0.07312694936990738, + "step": 10420 + }, + { + "epoch": 3.47898599066044, + "loss": 0.5233, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "grad_norm": 4.157192230224609, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "learning_rate": 0.00018849665985846967, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "loss": 0.4405640959739685, + "step": 10430 + }, + { + "ce_loss": 0.0566069558262825, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "distill_loss": 0.10539036989212036, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "ref_ce_loss": 0.07350980490446091, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "loss": 0.3644062876701355, + "step": 10430 + }, + { + "ce_loss": 0.11124683171510696, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "distill_loss": 0.11445042490959167, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "ref_ce_loss": 0.10792446881532669, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "loss": 0.23093454539775848, + "step": 10430 + }, + { + "ce_loss": 0.07317502796649933, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "distill_loss": 0.09828302264213562, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "ref_ce_loss": 0.059230588376522064, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "loss": 0.40664955973625183, + "step": 10430 + }, + { + "ce_loss": 0.1365078091621399, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "distill_loss": 0.12795759737491608, + "epoch": 3.47898599066044, + "step": 10430 + }, + { + "epoch": 3.47898599066044, + "ref_ce_loss": 0.09858806431293488, + "step": 10430 + }, + { + "epoch": 3.4823215476984655, + "loss": 0.4908, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "grad_norm": 7.698869228363037, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "learning_rate": 0.0001883008480049276, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "loss": 0.8871216773986816, + "step": 10440 + }, + { + "ce_loss": 0.09219731390476227, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "distill_loss": 0.12486536055803299, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "ref_ce_loss": 0.12163364142179489, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "loss": 0.4638303816318512, + "step": 10440 + }, + { + "ce_loss": 0.09351971000432968, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "distill_loss": 0.17921598255634308, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "ref_ce_loss": 0.06343629211187363, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "loss": 0.3681568503379822, + "step": 10440 + }, + { + "ce_loss": 0.048713743686676025, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "distill_loss": 0.14604920148849487, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "ref_ce_loss": 0.054822877049446106, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "loss": 0.44607189297676086, + "step": 10440 + }, + { + "ce_loss": 0.16012151539325714, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "distill_loss": 0.12422513961791992, + "epoch": 3.4823215476984655, + "step": 10440 + }, + { + "epoch": 3.4823215476984655, + "ref_ce_loss": 0.1300317794084549, + "step": 10440 + }, + { + "epoch": 3.485657104736491, + "loss": 0.5404, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "grad_norm": 2.3111071586608887, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "learning_rate": 0.0001881049663057473, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "loss": 0.6618536710739136, + "step": 10450 + }, + { + "ce_loss": 0.19961726665496826, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "distill_loss": 0.1706632822751999, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "ref_ce_loss": 0.13822656869888306, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "loss": 0.31794148683547974, + "step": 10450 + }, + { + "ce_loss": 0.09808186441659927, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "distill_loss": 0.131150022149086, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "ref_ce_loss": 0.08795040100812912, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "loss": 0.6011213064193726, + "step": 10450 + }, + { + "ce_loss": 0.08202079683542252, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "distill_loss": 0.11499512195587158, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "ref_ce_loss": 0.1003682017326355, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "loss": 0.33552274107933044, + "step": 10450 + }, + { + "ce_loss": 0.06787855923175812, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "distill_loss": 0.10218380391597748, + "epoch": 3.485657104736491, + "step": 10450 + }, + { + "epoch": 3.485657104736491, + "ref_ce_loss": 0.1092182844877243, + "step": 10450 + }, + { + "epoch": 3.488992661774516, + "loss": 0.4999, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "grad_norm": 2.911756753921509, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "learning_rate": 0.00018790901511813962, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "loss": 0.47455549240112305, + "step": 10460 + }, + { + "ce_loss": 0.1233755499124527, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "distill_loss": 0.11691313236951828, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "ref_ce_loss": 0.1434135138988495, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "loss": 0.5427463054656982, + "step": 10460 + }, + { + "ce_loss": 0.09475378692150116, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "distill_loss": 0.15717893838882446, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "ref_ce_loss": 0.11762606352567673, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "loss": 1.103336215019226, + "step": 10460 + }, + { + "ce_loss": 0.22373856604099274, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "distill_loss": 0.22364136576652527, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "ref_ce_loss": 0.11285527795553207, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "loss": 0.49190598726272583, + "step": 10460 + }, + { + "ce_loss": 0.09300311654806137, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "distill_loss": 0.16091318428516388, + "epoch": 3.488992661774516, + "step": 10460 + }, + { + "epoch": 3.488992661774516, + "ref_ce_loss": 0.09601572901010513, + "step": 10460 + }, + { + "epoch": 3.4923282188125415, + "loss": 0.4919, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "grad_norm": 3.8968892097473145, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "learning_rate": 0.00018771299479944218, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "loss": 0.6155132055282593, + "step": 10470 + }, + { + "ce_loss": 0.07882112264633179, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "distill_loss": 0.1573733240365982, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "ref_ce_loss": 0.13969795405864716, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "loss": 0.5589838624000549, + "step": 10470 + }, + { + "ce_loss": 0.24933810532093048, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "distill_loss": 0.11963921785354614, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "ref_ce_loss": 0.12221920490264893, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "loss": 0.5622706413269043, + "step": 10470 + }, + { + "ce_loss": 0.17802667617797852, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "distill_loss": 0.15607579052448273, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "ref_ce_loss": 0.1545151174068451, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "loss": 0.43948787450790405, + "step": 10470 + }, + { + "ce_loss": 0.155122771859169, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "distill_loss": 0.16137826442718506, + "epoch": 3.4923282188125415, + "step": 10470 + }, + { + "epoch": 3.4923282188125415, + "ref_ce_loss": 0.12277895212173462, + "step": 10470 + }, + { + "epoch": 3.495663775850567, + "loss": 0.4717, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "grad_norm": 3.666346311569214, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "learning_rate": 0.00018751690570711885, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "loss": 0.2625570297241211, + "step": 10480 + }, + { + "ce_loss": 0.07310660928487778, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "distill_loss": 0.08818459510803223, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "ref_ce_loss": 0.07402611523866653, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "loss": 0.49244633316993713, + "step": 10480 + }, + { + "ce_loss": 0.13064201176166534, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "distill_loss": 0.15354804694652557, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "ref_ce_loss": 0.11334028840065002, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "loss": 0.42023003101348877, + "step": 10480 + }, + { + "ce_loss": 0.0777747705578804, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "distill_loss": 0.12232057750225067, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "ref_ce_loss": 0.09364210814237595, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "loss": 0.3053067624568939, + "step": 10480 + }, + { + "ce_loss": 0.09775067120790482, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "distill_loss": 0.12994107604026794, + "epoch": 3.495663775850567, + "step": 10480 + }, + { + "epoch": 3.495663775850567, + "ref_ce_loss": 0.07745039463043213, + "step": 10480 + }, + { + "epoch": 3.498999332888592, + "loss": 0.4981, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "grad_norm": 2.8985745906829834, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "learning_rate": 0.00018732074819875872, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "loss": 0.3826335668563843, + "step": 10490 + }, + { + "ce_loss": 0.09962109476327896, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "distill_loss": 0.11335495859384537, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "ref_ce_loss": 0.07441458851099014, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "loss": 0.4468466639518738, + "step": 10490 + }, + { + "ce_loss": 0.06108476594090462, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "distill_loss": 0.10852596163749695, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "ref_ce_loss": 0.07920630276203156, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "loss": 0.42822498083114624, + "step": 10490 + }, + { + "ce_loss": 0.12201324850320816, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "distill_loss": 0.10355043411254883, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "ref_ce_loss": 0.15949228405952454, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "loss": 0.5487578511238098, + "step": 10490 + }, + { + "ce_loss": 0.17656852304935455, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "distill_loss": 0.14200134575366974, + "epoch": 3.498999332888592, + "step": 10490 + }, + { + "epoch": 3.498999332888592, + "ref_ce_loss": 0.10942427814006805, + "step": 10490 + }, + { + "epoch": 3.502334889926618, + "loss": 0.472, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "grad_norm": 1.802612543106079, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "learning_rate": 0.0001871245226320757, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "loss": 0.6035943031311035, + "step": 10500 + }, + { + "ce_loss": 0.17178229987621307, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "distill_loss": 0.14971838891506195, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "ref_ce_loss": 0.0989990159869194, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "loss": 0.36782369017601013, + "step": 10500 + }, + { + "ce_loss": 0.12339980900287628, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "distill_loss": 0.14437860250473022, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "ref_ce_loss": 0.09994472563266754, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "loss": 0.4430643916130066, + "step": 10500 + }, + { + "ce_loss": 0.17111869156360626, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "distill_loss": 0.1422690451145172, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "ref_ce_loss": 0.1295473873615265, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "loss": 0.5198507905006409, + "step": 10500 + }, + { + "ce_loss": 0.07539082318544388, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "distill_loss": 0.13079798221588135, + "epoch": 3.502334889926618, + "step": 10500 + }, + { + "epoch": 3.502334889926618, + "ref_ce_loss": 0.1290043443441391, + "step": 10500 + }, + { + "epoch": 3.5056704469646434, + "loss": 0.5079, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "grad_norm": 4.614372253417969, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "learning_rate": 0.00018692822936490784, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "loss": 0.41191405057907104, + "step": 10510 + }, + { + "ce_loss": 0.06786329299211502, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "distill_loss": 0.1034030169248581, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "ref_ce_loss": 0.1216384768486023, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "loss": 0.4477238655090332, + "step": 10510 + }, + { + "ce_loss": 0.11512485891580582, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "distill_loss": 0.13957633078098297, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "ref_ce_loss": 0.09763330221176147, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "loss": 0.6907591223716736, + "step": 10510 + }, + { + "ce_loss": 0.20804905891418457, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "distill_loss": 0.21027851104736328, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "ref_ce_loss": 0.11685810983181, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "loss": 0.5303629636764526, + "step": 10510 + }, + { + "ce_loss": 0.17830605804920197, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "distill_loss": 0.20348048210144043, + "epoch": 3.5056704469646434, + "step": 10510 + }, + { + "epoch": 3.5056704469646434, + "ref_ce_loss": 0.10697298496961594, + "step": 10510 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.5298, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "grad_norm": 2.2804954051971436, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "learning_rate": 0.00018673186875521657, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.39185506105422974, + "step": 10520 + }, + { + "ce_loss": 0.15698541700839996, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "distill_loss": 0.1254022717475891, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "ref_ce_loss": 0.10929703712463379, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.27361857891082764, + "step": 10520 + }, + { + "ce_loss": 0.10073763132095337, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "distill_loss": 0.1090550348162651, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "ref_ce_loss": 0.06371267884969711, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.47119683027267456, + "step": 10520 + }, + { + "ce_loss": 0.1285894811153412, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "distill_loss": 0.15611502528190613, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "ref_ce_loss": 0.1049136221408844, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "loss": 0.28055694699287415, + "step": 10520 + }, + { + "ce_loss": 0.037441760301589966, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "distill_loss": 0.09433382004499435, + "epoch": 3.5090060040026687, + "step": 10520 + }, + { + "epoch": 3.5090060040026687, + "ref_ce_loss": 0.07896076887845993, + "step": 10520 + }, + { + "epoch": 3.512341561040694, + "loss": 0.4552, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "grad_norm": 2.3658037185668945, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "learning_rate": 0.00018653544116108625, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "loss": 0.5972455143928528, + "step": 10530 + }, + { + "ce_loss": 0.16446569561958313, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "distill_loss": 0.1522887945175171, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "ref_ce_loss": 0.1197158694267273, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "loss": 0.522158682346344, + "step": 10530 + }, + { + "ce_loss": 0.250407874584198, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "distill_loss": 0.12939266860485077, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "ref_ce_loss": 0.11329744011163712, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "loss": 0.294116348028183, + "step": 10530 + }, + { + "ce_loss": 0.09095935523509979, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "distill_loss": 0.10605276376008987, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "ref_ce_loss": 0.054930780082941055, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "loss": 0.24210438132286072, + "step": 10530 + }, + { + "ce_loss": 0.029579443857073784, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "distill_loss": 0.1047045886516571, + "epoch": 3.512341561040694, + "step": 10530 + }, + { + "epoch": 3.512341561040694, + "ref_ce_loss": 0.05087565630674362, + "step": 10530 + }, + { + "epoch": 3.5156771180787194, + "loss": 0.5145, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "grad_norm": 2.366520643234253, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "learning_rate": 0.00018633894694072337, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "loss": 0.6220971941947937, + "step": 10540 + }, + { + "ce_loss": 0.12055147439241409, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "distill_loss": 0.1395721137523651, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "ref_ce_loss": 0.17843414843082428, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "loss": 0.5132659673690796, + "step": 10540 + }, + { + "ce_loss": 0.11431053280830383, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "distill_loss": 0.10479498654603958, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "ref_ce_loss": 0.12466438114643097, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "loss": 0.805692732334137, + "step": 10540 + }, + { + "ce_loss": 0.22681838274002075, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "distill_loss": 0.16361652314662933, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "ref_ce_loss": 0.1392831951379776, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "loss": 0.5683649182319641, + "step": 10540 + }, + { + "ce_loss": 0.1613687127828598, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "distill_loss": 0.17765222489833832, + "epoch": 3.5156771180787194, + "step": 10540 + }, + { + "epoch": 3.5156771180787194, + "ref_ce_loss": 0.15474840998649597, + "step": 10540 + }, + { + "epoch": 3.5190126751167448, + "loss": 0.5035, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "grad_norm": 2.3484420776367188, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "learning_rate": 0.00018614238645245574, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "loss": 0.3599805235862732, + "step": 10550 + }, + { + "ce_loss": 0.11466601490974426, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "distill_loss": 0.15461599826812744, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "ref_ce_loss": 0.08937636762857437, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "loss": 0.5544697046279907, + "step": 10550 + }, + { + "ce_loss": 0.09430625289678574, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "distill_loss": 0.12568268179893494, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "ref_ce_loss": 0.13568513095378876, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "loss": 0.5882728099822998, + "step": 10550 + }, + { + "ce_loss": 0.12761719524860382, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "distill_loss": 0.14731423556804657, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "ref_ce_loss": 0.05525919422507286, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "loss": 0.4472968578338623, + "step": 10550 + }, + { + "ce_loss": 0.14880354702472687, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "distill_loss": 0.16218119859695435, + "epoch": 3.5190126751167448, + "step": 10550 + }, + { + "epoch": 3.5190126751167448, + "ref_ce_loss": 0.08404110372066498, + "step": 10550 + }, + { + "epoch": 3.52234823215477, + "loss": 0.5271, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "grad_norm": 2.0116426944732666, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "learning_rate": 0.00018594576005473228, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "loss": 0.33950960636138916, + "step": 10560 + }, + { + "ce_loss": 0.10914910584688187, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "distill_loss": 0.14298668503761292, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "ref_ce_loss": 0.08681947737932205, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "loss": 1.114986538887024, + "step": 10560 + }, + { + "ce_loss": 0.1504950225353241, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "distill_loss": 0.12499750405550003, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "ref_ce_loss": 0.10533667355775833, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "loss": 0.3771263360977173, + "step": 10560 + }, + { + "ce_loss": 0.09664015471935272, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "distill_loss": 0.12093089520931244, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "ref_ce_loss": 0.08612294495105743, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "loss": 0.4830138385295868, + "step": 10560 + }, + { + "ce_loss": 0.12343651801347733, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "distill_loss": 0.17175374925136566, + "epoch": 3.52234823215477, + "step": 10560 + }, + { + "epoch": 3.52234823215477, + "ref_ce_loss": 0.08810058981180191, + "step": 10560 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.5195, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "grad_norm": 3.038257122039795, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "learning_rate": 0.00018574906810612187, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.3658252954483032, + "step": 10570 + }, + { + "ce_loss": 0.13670586049556732, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "distill_loss": 0.1163606196641922, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "ref_ce_loss": 0.08734627068042755, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.37185975909233093, + "step": 10570 + }, + { + "ce_loss": 0.10969644784927368, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "distill_loss": 0.12583912909030914, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "ref_ce_loss": 0.10249805450439453, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.48817941546440125, + "step": 10570 + }, + { + "ce_loss": 0.1386563777923584, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "distill_loss": 0.18367363512516022, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "ref_ce_loss": 0.12123288214206696, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "loss": 0.39616870880126953, + "step": 10570 + }, + { + "ce_loss": 0.0724596455693245, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "distill_loss": 0.1940024048089981, + "epoch": 3.5256837891927955, + "step": 10570 + }, + { + "epoch": 3.5256837891927955, + "ref_ce_loss": 0.09160967916250229, + "step": 10570 + }, + { + "epoch": 3.529019346230821, + "loss": 0.5065, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "grad_norm": 2.6603708267211914, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "learning_rate": 0.0001855523109653131, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "loss": 0.46012818813323975, + "step": 10580 + }, + { + "ce_loss": 0.13441084325313568, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "distill_loss": 0.18987199664115906, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "ref_ce_loss": 0.1355372965335846, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "loss": 0.367043673992157, + "step": 10580 + }, + { + "ce_loss": 0.10793867707252502, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "distill_loss": 0.13718649744987488, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "ref_ce_loss": 0.08491414785385132, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "loss": 0.4276635944843292, + "step": 10580 + }, + { + "ce_loss": 0.1227010190486908, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "distill_loss": 0.16282536089420319, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "ref_ce_loss": 0.08905931562185287, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "loss": 0.573056161403656, + "step": 10580 + }, + { + "ce_loss": 0.1919986456632614, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "distill_loss": 0.17559261620044708, + "epoch": 3.529019346230821, + "step": 10580 + }, + { + "epoch": 3.529019346230821, + "ref_ce_loss": 0.14072009921073914, + "step": 10580 + }, + { + "epoch": 3.532354903268846, + "loss": 0.4826, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "grad_norm": 1.7876719236373901, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "learning_rate": 0.00018535548899111342, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "loss": 0.4782566428184509, + "step": 10590 + }, + { + "ce_loss": 0.10534249991178513, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "distill_loss": 0.14548616111278534, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "ref_ce_loss": 0.08125923573970795, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "loss": 0.4691029489040375, + "step": 10590 + }, + { + "ce_loss": 0.16666607558727264, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "distill_loss": 0.1389254778623581, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "ref_ce_loss": 0.11366435140371323, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "loss": 0.3638518750667572, + "step": 10590 + }, + { + "ce_loss": 0.09428536146879196, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "distill_loss": 0.15336307883262634, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "ref_ce_loss": 0.11602702736854553, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "loss": 0.8029030561447144, + "step": 10590 + }, + { + "ce_loss": 0.23406115174293518, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "distill_loss": 0.14278815686702728, + "epoch": 3.532354903268846, + "step": 10590 + }, + { + "epoch": 3.532354903268846, + "ref_ce_loss": 0.14123186469078064, + "step": 10590 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.5171, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "grad_norm": 4.143054485321045, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "learning_rate": 0.00018515860254244844, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.27573245763778687, + "step": 10600 + }, + { + "ce_loss": 0.08236755430698395, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "distill_loss": 0.09246663004159927, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "ref_ce_loss": 0.06662546843290329, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.510116696357727, + "step": 10600 + }, + { + "ce_loss": 0.17182530462741852, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "distill_loss": 0.118833988904953, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "ref_ce_loss": 0.1660315990447998, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.4465329945087433, + "step": 10600 + }, + { + "ce_loss": 0.1422976702451706, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "distill_loss": 0.12561358511447906, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "ref_ce_loss": 0.06846870481967926, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "loss": 0.8853085041046143, + "step": 10600 + }, + { + "ce_loss": 0.10994315892457962, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "distill_loss": 0.12597742676734924, + "epoch": 3.5356904603068715, + "step": 10600 + }, + { + "epoch": 3.5356904603068715, + "ref_ce_loss": 0.07962527126073837, + "step": 10600 + }, + { + "epoch": 3.539026017344897, + "loss": 0.4569, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "grad_norm": 1.6940289735794067, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "learning_rate": 0.0001849616519783613, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "loss": 0.5881628394126892, + "step": 10610 + }, + { + "ce_loss": 0.16431401669979095, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "distill_loss": 0.14111711084842682, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "ref_ce_loss": 0.08906078338623047, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "loss": 0.31506747007369995, + "step": 10610 + }, + { + "ce_loss": 0.05988002568483353, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "distill_loss": 0.12606583535671234, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "ref_ce_loss": 0.12904591858386993, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "loss": 0.6737792491912842, + "step": 10610 + }, + { + "ce_loss": 0.06801124662160873, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "distill_loss": 0.10389474034309387, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "ref_ce_loss": 0.11164550483226776, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "loss": 0.43432092666625977, + "step": 10610 + }, + { + "ce_loss": 0.12314651161432266, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "distill_loss": 0.10934734344482422, + "epoch": 3.539026017344897, + "step": 10610 + }, + { + "epoch": 3.539026017344897, + "ref_ce_loss": 0.10453308373689651, + "step": 10610 + }, + { + "epoch": 3.542361574382922, + "loss": 0.4952, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "grad_norm": 2.9235355854034424, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "learning_rate": 0.00018476463765801216, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "loss": 0.49110227823257446, + "step": 10620 + }, + { + "ce_loss": 0.10042405873537064, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "distill_loss": 0.1030670553445816, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "ref_ce_loss": 0.09770134836435318, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "loss": 0.35014456510543823, + "step": 10620 + }, + { + "ce_loss": 0.1137394830584526, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "distill_loss": 0.09898874908685684, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "ref_ce_loss": 0.13730838894844055, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "loss": 0.2383749634027481, + "step": 10620 + }, + { + "ce_loss": 0.06447065621614456, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "distill_loss": 0.08753521740436554, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "ref_ce_loss": 0.0862131118774414, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "loss": 0.35761362314224243, + "step": 10620 + }, + { + "ce_loss": 0.07475518435239792, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "distill_loss": 0.12471903860569, + "epoch": 3.542361574382922, + "step": 10620 + }, + { + "epoch": 3.542361574382922, + "ref_ce_loss": 0.0990569218993187, + "step": 10620 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.4151, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "grad_norm": 2.373331069946289, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "learning_rate": 0.00018456755994067758, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.35218167304992676, + "step": 10630 + }, + { + "ce_loss": 0.12417849153280258, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "distill_loss": 0.10794119536876678, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "ref_ce_loss": 0.09236135333776474, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.3189576268196106, + "step": 10630 + }, + { + "ce_loss": 0.057388123124837875, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "distill_loss": 0.08465489745140076, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "ref_ce_loss": 0.051392436027526855, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.43372079730033875, + "step": 10630 + }, + { + "ce_loss": 0.17675387859344482, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "distill_loss": 0.10880421847105026, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "ref_ce_loss": 0.10260260105133057, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "loss": 0.32213619351387024, + "step": 10630 + }, + { + "ce_loss": 0.11360274255275726, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "distill_loss": 0.10571669787168503, + "epoch": 3.5456971314209476, + "step": 10630 + }, + { + "epoch": 3.5456971314209476, + "ref_ce_loss": 0.0778336301445961, + "step": 10630 + }, + { + "epoch": 3.549032688458973, + "loss": 0.4453, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "grad_norm": 2.135650157928467, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "learning_rate": 0.00018437041918574937, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "loss": 0.2195785492658615, + "step": 10640 + }, + { + "ce_loss": 0.04529380798339844, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "distill_loss": 0.0795193761587143, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "ref_ce_loss": 0.0945015624165535, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "loss": 0.4393448531627655, + "step": 10640 + }, + { + "ce_loss": 0.1371816247701645, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "distill_loss": 0.11540985852479935, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "ref_ce_loss": 0.1122361347079277, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "loss": 0.9115976691246033, + "step": 10640 + }, + { + "ce_loss": 0.15142419934272766, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "distill_loss": 0.10086096078157425, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "ref_ce_loss": 0.13035528361797333, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "loss": 0.7502701282501221, + "step": 10640 + }, + { + "ce_loss": 0.14902283251285553, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "distill_loss": 0.11329962313175201, + "epoch": 3.549032688458973, + "step": 10640 + }, + { + "epoch": 3.549032688458973, + "ref_ce_loss": 0.12520849704742432, + "step": 10640 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.4406, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "grad_norm": 2.5156853199005127, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "learning_rate": 0.00018417321575273462, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.2595893442630768, + "step": 10650 + }, + { + "ce_loss": 0.07727423310279846, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "distill_loss": 0.08966349065303802, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "ref_ce_loss": 0.06863352656364441, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.6745221018791199, + "step": 10650 + }, + { + "ce_loss": 0.1965639591217041, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "distill_loss": 0.12107309699058533, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "ref_ce_loss": 0.11794691532850266, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.4867534935474396, + "step": 10650 + }, + { + "ce_loss": 0.19840309023857117, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "distill_loss": 0.12193414568901062, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "ref_ce_loss": 0.11920852214097977, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "loss": 0.2611696720123291, + "step": 10650 + }, + { + "ce_loss": 0.05607572942972183, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "distill_loss": 0.07235915213823318, + "epoch": 3.5523682454969983, + "step": 10650 + }, + { + "epoch": 3.5523682454969983, + "ref_ce_loss": 0.08902841061353683, + "step": 10650 + }, + { + "epoch": 3.5557038025350236, + "loss": 0.4682, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "grad_norm": 1.9032304286956787, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "learning_rate": 0.00018397595000125454, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "loss": 0.5261881947517395, + "step": 10660 + }, + { + "ce_loss": 0.11859346181154251, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "distill_loss": 0.11338892579078674, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "ref_ce_loss": 0.07596824318170547, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "loss": 0.29960116744041443, + "step": 10660 + }, + { + "ce_loss": 0.10883176326751709, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "distill_loss": 0.09616100788116455, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "ref_ce_loss": 0.0670178085565567, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "loss": 0.5896754264831543, + "step": 10660 + }, + { + "ce_loss": 0.16429340839385986, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "distill_loss": 0.09835916757583618, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "ref_ce_loss": 0.08974584937095642, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "loss": 0.281375527381897, + "step": 10660 + }, + { + "ce_loss": 0.07209251821041107, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "distill_loss": 0.10707713663578033, + "epoch": 3.5557038025350236, + "step": 10660 + }, + { + "epoch": 3.5557038025350236, + "ref_ce_loss": 0.10172483325004578, + "step": 10660 + }, + { + "epoch": 3.559039359573049, + "loss": 0.4404, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "grad_norm": 3.276700496673584, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "learning_rate": 0.0001837786222910441, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "loss": 0.46738162636756897, + "step": 10670 + }, + { + "ce_loss": 0.17903101444244385, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "distill_loss": 0.15275321900844574, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "ref_ce_loss": 0.10681027919054031, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "loss": 0.3164319694042206, + "step": 10670 + }, + { + "ce_loss": 0.09890428930521011, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "distill_loss": 0.0927501991391182, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "ref_ce_loss": 0.08335583657026291, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "loss": 0.3328627347946167, + "step": 10670 + }, + { + "ce_loss": 0.08496581763029099, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "distill_loss": 0.1014360636472702, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "ref_ce_loss": 0.095091313123703, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "loss": 0.3671519458293915, + "step": 10670 + }, + { + "ce_loss": 0.14440912008285522, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "distill_loss": 0.1046408861875534, + "epoch": 3.559039359573049, + "step": 10670 + }, + { + "epoch": 3.559039359573049, + "ref_ce_loss": 0.11774665862321854, + "step": 10670 + }, + { + "epoch": 3.5623749166110743, + "loss": 0.454, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "grad_norm": 2.460484504699707, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "learning_rate": 0.00018358123298195119, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "loss": 0.46191108226776123, + "step": 10680 + }, + { + "ce_loss": 0.1277770698070526, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "distill_loss": 0.11336661875247955, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "ref_ce_loss": 0.13188976049423218, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "loss": 0.3886517286300659, + "step": 10680 + }, + { + "ce_loss": 0.1443498581647873, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "distill_loss": 0.12114789336919785, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "ref_ce_loss": 0.0856688991189003, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "loss": 0.4110898971557617, + "step": 10680 + }, + { + "ce_loss": 0.15371674299240112, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "distill_loss": 0.11015059053897858, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "ref_ce_loss": 0.08587423712015152, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "loss": 0.3749980926513672, + "step": 10680 + }, + { + "ce_loss": 0.1017778292298317, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "distill_loss": 0.1276194453239441, + "epoch": 3.5623749166110743, + "step": 10680 + }, + { + "epoch": 3.5623749166110743, + "ref_ce_loss": 0.11321152746677399, + "step": 10680 + }, + { + "epoch": 3.5657104736490997, + "loss": 0.4719, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "grad_norm": 2.446310520172119, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "learning_rate": 0.00018338378243393604, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "loss": 0.44579532742500305, + "step": 10690 + }, + { + "ce_loss": 0.11378941684961319, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "distill_loss": 0.13413788378238678, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "ref_ce_loss": 0.09782540798187256, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "loss": 0.5395914316177368, + "step": 10690 + }, + { + "ce_loss": 0.12870147824287415, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "distill_loss": 0.126537024974823, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "ref_ce_loss": 0.09157304465770721, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "loss": 0.3987729549407959, + "step": 10690 + }, + { + "ce_loss": 0.0682898759841919, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "distill_loss": 0.13631054759025574, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "ref_ce_loss": 0.10172045975923538, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "loss": 0.40862464904785156, + "step": 10690 + }, + { + "ce_loss": 0.13958579301834106, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "distill_loss": 0.10409563779830933, + "epoch": 3.5657104736490997, + "step": 10690 + }, + { + "epoch": 3.5657104736490997, + "ref_ce_loss": 0.09258479624986649, + "step": 10690 + }, + { + "epoch": 3.569046030687125, + "loss": 0.4992, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "grad_norm": 3.553321599960327, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "learning_rate": 0.00018318627100707052, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "loss": 1.3002853393554688, + "step": 10700 + }, + { + "ce_loss": 0.13170751929283142, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "distill_loss": 0.14222148060798645, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "ref_ce_loss": 0.10056454688310623, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "loss": 0.3703250586986542, + "step": 10700 + }, + { + "ce_loss": 0.0983681008219719, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "distill_loss": 0.16033944487571716, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "ref_ce_loss": 0.11107808351516724, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "loss": 0.4253198504447937, + "step": 10700 + }, + { + "ce_loss": 0.15250420570373535, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "distill_loss": 0.1289929747581482, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "ref_ce_loss": 0.1237674355506897, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "loss": 0.5686759352684021, + "step": 10700 + }, + { + "ce_loss": 0.23660250008106232, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "distill_loss": 0.20011036098003387, + "epoch": 3.569046030687125, + "step": 10700 + }, + { + "epoch": 3.569046030687125, + "ref_ce_loss": 0.12985314428806305, + "step": 10700 + }, + { + "epoch": 3.5723815877251504, + "loss": 0.5315, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "grad_norm": 2.264549493789673, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "learning_rate": 0.00018298869906153764, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "loss": 0.48029279708862305, + "step": 10710 + }, + { + "ce_loss": 0.13935871422290802, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "distill_loss": 0.18435510993003845, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "ref_ce_loss": 0.12784120440483093, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "loss": 0.26222920417785645, + "step": 10710 + }, + { + "ce_loss": 0.04506101459264755, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "distill_loss": 0.09920864552259445, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "ref_ce_loss": 0.06999955326318741, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "loss": 0.32495203614234924, + "step": 10710 + }, + { + "ce_loss": 0.0796368196606636, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "distill_loss": 0.117865189909935, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "ref_ce_loss": 0.10748656094074249, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "loss": 0.5730641484260559, + "step": 10710 + }, + { + "ce_loss": 0.20408304035663605, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "distill_loss": 0.18805451691150665, + "epoch": 3.5723815877251504, + "step": 10710 + }, + { + "epoch": 3.5723815877251504, + "ref_ce_loss": 0.13756021857261658, + "step": 10710 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.4969, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "grad_norm": 3.3614892959594727, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "learning_rate": 0.00018279106695763065, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.5536407828330994, + "step": 10720 + }, + { + "ce_loss": 0.18258598446846008, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "distill_loss": 0.18230997025966644, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "ref_ce_loss": 0.14992879331111908, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.37388432025909424, + "step": 10720 + }, + { + "ce_loss": 0.12813162803649902, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "distill_loss": 0.12815801799297333, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "ref_ce_loss": 0.08915966749191284, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.4762488305568695, + "step": 10720 + }, + { + "ce_loss": 0.11369907110929489, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "distill_loss": 0.14350983500480652, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "ref_ce_loss": 0.12805962562561035, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "loss": 0.4970819652080536, + "step": 10720 + }, + { + "ce_loss": 0.10555389523506165, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "distill_loss": 0.14790529012680054, + "epoch": 3.5757171447631757, + "step": 10720 + }, + { + "epoch": 3.5757171447631757, + "ref_ce_loss": 0.06945054978132248, + "step": 10720 + }, + { + "epoch": 3.579052701801201, + "loss": 0.4624, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "grad_norm": 2.028350591659546, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "learning_rate": 0.0001825933750557525, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "loss": 1.064501404762268, + "step": 10730 + }, + { + "ce_loss": 0.26156678795814514, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "distill_loss": 0.19015151262283325, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "ref_ce_loss": 0.14202123880386353, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "loss": 0.721092700958252, + "step": 10730 + }, + { + "ce_loss": 0.10923820734024048, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "distill_loss": 0.159623384475708, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "ref_ce_loss": 0.0903729498386383, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "loss": 0.6637827157974243, + "step": 10730 + }, + { + "ce_loss": 0.12120405584573746, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "distill_loss": 0.19847629964351654, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "ref_ce_loss": 0.09555134922266006, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "loss": 0.47606194019317627, + "step": 10730 + }, + { + "ce_loss": 0.1550755351781845, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "distill_loss": 0.14270076155662537, + "epoch": 3.579052701801201, + "step": 10730 + }, + { + "epoch": 3.579052701801201, + "ref_ce_loss": 0.09250533580780029, + "step": 10730 + }, + { + "epoch": 3.5823882588392264, + "loss": 0.5166, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "grad_norm": 2.4106342792510986, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "learning_rate": 0.00018239562371641537, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "loss": 1.0733962059020996, + "step": 10740 + }, + { + "ce_loss": 0.2264718860387802, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "distill_loss": 0.1669166088104248, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "ref_ce_loss": 0.17645739018917084, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "loss": 0.3027918040752411, + "step": 10740 + }, + { + "ce_loss": 0.08049654960632324, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "distill_loss": 0.07265809178352356, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "ref_ce_loss": 0.10848908871412277, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "loss": 0.45776477456092834, + "step": 10740 + }, + { + "ce_loss": 0.11973146349191666, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "distill_loss": 0.1842823475599289, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "ref_ce_loss": 0.1533636748790741, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "loss": 0.5457181334495544, + "step": 10740 + }, + { + "ce_loss": 0.12401236593723297, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "distill_loss": 0.11291433125734329, + "epoch": 3.5823882588392264, + "step": 10740 + }, + { + "epoch": 3.5823882588392264, + "ref_ce_loss": 0.10846975445747375, + "step": 10740 + }, + { + "epoch": 3.5857238158772518, + "loss": 0.5116, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "grad_norm": 3.506852865219116, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "learning_rate": 0.00018219781330023954, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "loss": 0.7628376483917236, + "step": 10750 + }, + { + "ce_loss": 0.07697892189025879, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "distill_loss": 0.08748073875904083, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "ref_ce_loss": 0.06737169623374939, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "loss": 0.22098787128925323, + "step": 10750 + }, + { + "ce_loss": 0.044790759682655334, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "distill_loss": 0.09421558678150177, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "ref_ce_loss": 0.06399078667163849, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "loss": 0.4772864580154419, + "step": 10750 + }, + { + "ce_loss": 0.17643699049949646, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "distill_loss": 0.14804883301258087, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "ref_ce_loss": 0.11784358322620392, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "loss": 0.3947041928768158, + "step": 10750 + }, + { + "ce_loss": 0.09214789420366287, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "distill_loss": 0.11591991782188416, + "epoch": 3.5857238158772518, + "step": 10750 + }, + { + "epoch": 3.5857238158772518, + "ref_ce_loss": 0.104644276201725, + "step": 10750 + }, + { + "epoch": 3.589059372915277, + "loss": 0.4357, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "grad_norm": 1.944047212600708, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "learning_rate": 0.00018199994416795323, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "loss": 0.47622978687286377, + "step": 10760 + }, + { + "ce_loss": 0.06587539613246918, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "distill_loss": 0.1422610878944397, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "ref_ce_loss": 0.07343024760484695, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "loss": 0.8206377625465393, + "step": 10760 + }, + { + "ce_loss": 0.12380576133728027, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "distill_loss": 0.12928557395935059, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "ref_ce_loss": 0.10670095682144165, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "loss": 0.5720410943031311, + "step": 10760 + }, + { + "ce_loss": 0.10976602137088776, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "distill_loss": 0.10744272172451019, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "ref_ce_loss": 0.0642593652009964, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "loss": 0.4113171398639679, + "step": 10760 + }, + { + "ce_loss": 0.16085822880268097, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "distill_loss": 0.11362794786691666, + "epoch": 3.589059372915277, + "step": 10760 + }, + { + "epoch": 3.589059372915277, + "ref_ce_loss": 0.0914236307144165, + "step": 10760 + }, + { + "epoch": 3.5923949299533025, + "loss": 0.4972, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "grad_norm": 7.0427727699279785, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "learning_rate": 0.0001818020166803918, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "loss": 0.807584285736084, + "step": 10770 + }, + { + "ce_loss": 0.22549563646316528, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "distill_loss": 0.20602241158485413, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "ref_ce_loss": 0.24110868573188782, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "loss": 0.4110437035560608, + "step": 10770 + }, + { + "ce_loss": 0.10992979258298874, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "distill_loss": 0.15975235402584076, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "ref_ce_loss": 0.09724153578281403, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "loss": 0.44220978021621704, + "step": 10770 + }, + { + "ce_loss": 0.1018572449684143, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "distill_loss": 0.16083693504333496, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "ref_ce_loss": 0.08170153200626373, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "loss": 0.46427327394485474, + "step": 10770 + }, + { + "ce_loss": 0.11812613904476166, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "distill_loss": 0.17142073810100555, + "epoch": 3.5923949299533025, + "step": 10770 + }, + { + "epoch": 3.5923949299533025, + "ref_ce_loss": 0.11512557417154312, + "step": 10770 + }, + { + "epoch": 3.595730486991328, + "loss": 0.4756, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "grad_norm": 1.7970086336135864, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "learning_rate": 0.00018160403119849673, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "loss": 0.3645995855331421, + "step": 10780 + }, + { + "ce_loss": 0.1401004046201706, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "distill_loss": 0.11823718249797821, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "ref_ce_loss": 0.07413559406995773, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "loss": 0.5593521595001221, + "step": 10780 + }, + { + "ce_loss": 0.12035626918077469, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "distill_loss": 0.18114838004112244, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "ref_ce_loss": 0.1041061133146286, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "loss": 0.5523062348365784, + "step": 10780 + }, + { + "ce_loss": 0.11224917322397232, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "distill_loss": 0.17238086462020874, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "ref_ce_loss": 0.10551968216896057, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "loss": 0.4836883544921875, + "step": 10780 + }, + { + "ce_loss": 0.09018741548061371, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "distill_loss": 0.1445465087890625, + "epoch": 3.595730486991328, + "step": 10780 + }, + { + "epoch": 3.595730486991328, + "ref_ce_loss": 0.10304789990186691, + "step": 10780 + }, + { + "epoch": 3.599066044029353, + "loss": 0.5156, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "grad_norm": 2.589780330657959, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "learning_rate": 0.00018140598808331557, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "loss": 0.3814545273780823, + "step": 10790 + }, + { + "ce_loss": 0.14167527854442596, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "distill_loss": 0.14271581172943115, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "ref_ce_loss": 0.06693271547555923, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "loss": 0.6055205464363098, + "step": 10790 + }, + { + "ce_loss": 0.19398923218250275, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "distill_loss": 0.1850125938653946, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "ref_ce_loss": 0.12055008858442307, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "loss": 0.4280034005641937, + "step": 10790 + }, + { + "ce_loss": 0.13872362673282623, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "distill_loss": 0.15624187886714935, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "ref_ce_loss": 0.09329959005117416, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "loss": 0.5235930681228638, + "step": 10790 + }, + { + "ce_loss": 0.21662373840808868, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "distill_loss": 0.09369595348834991, + "epoch": 3.599066044029353, + "step": 10790 + }, + { + "epoch": 3.599066044029353, + "ref_ce_loss": 0.09563537687063217, + "step": 10790 + }, + { + "epoch": 3.6024016010673785, + "loss": 0.4679, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "grad_norm": 1.9319144487380981, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "learning_rate": 0.0001812078876960008, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "loss": 0.2767658829689026, + "step": 10800 + }, + { + "ce_loss": 0.07563504576683044, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "distill_loss": 0.13797806203365326, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "ref_ce_loss": 0.06295233219861984, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "loss": 0.3499141037464142, + "step": 10800 + }, + { + "ce_loss": 0.10508345067501068, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "distill_loss": 0.1375848948955536, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "ref_ce_loss": 0.0647381991147995, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "loss": 0.5485572814941406, + "step": 10800 + }, + { + "ce_loss": 0.1507655829191208, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "distill_loss": 0.1268126517534256, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "ref_ce_loss": 0.12260568141937256, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "loss": 0.6075488328933716, + "step": 10800 + }, + { + "ce_loss": 0.16358868777751923, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "distill_loss": 0.1781436651945114, + "epoch": 3.6024016010673785, + "step": 10800 + }, + { + "epoch": 3.6024016010673785, + "ref_ce_loss": 0.13535261154174805, + "step": 10800 + }, + { + "epoch": 3.605737158105404, + "loss": 0.4998, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "grad_norm": 5.021475791931152, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "learning_rate": 0.00018100973039780933, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "loss": 0.5075588226318359, + "step": 10810 + }, + { + "ce_loss": 0.15543168783187866, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "distill_loss": 0.14139851927757263, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "ref_ce_loss": 0.12484012544155121, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "loss": 0.26541557908058167, + "step": 10810 + }, + { + "ce_loss": 0.08082418888807297, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "distill_loss": 0.13024282455444336, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "ref_ce_loss": 0.054262448102235794, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "loss": 0.4680088758468628, + "step": 10810 + }, + { + "ce_loss": 0.1070551946759224, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "distill_loss": 0.15847979485988617, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "ref_ce_loss": 0.12366623431444168, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "loss": 0.5198716521263123, + "step": 10810 + }, + { + "ce_loss": 0.18975123763084412, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "distill_loss": 0.1683826595544815, + "epoch": 3.605737158105404, + "step": 10810 + }, + { + "epoch": 3.605737158105404, + "ref_ce_loss": 0.1616154909133911, + "step": 10810 + }, + { + "epoch": 3.609072715143429, + "loss": 0.4423, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "grad_norm": 1.987995982170105, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "learning_rate": 0.00018081151655010202, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "loss": 0.31308069825172424, + "step": 10820 + }, + { + "ce_loss": 0.10505445301532745, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "distill_loss": 0.11159270256757736, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "ref_ce_loss": 0.09632076323032379, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "loss": 0.47415441274642944, + "step": 10820 + }, + { + "ce_loss": 0.04985740780830383, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "distill_loss": 0.12312208116054535, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "ref_ce_loss": 0.0715782567858696, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "loss": 0.4616354703903198, + "step": 10820 + }, + { + "ce_loss": 0.11249656975269318, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "distill_loss": 0.12125895917415619, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "ref_ce_loss": 0.09925705194473267, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "loss": 0.34156709909439087, + "step": 10820 + }, + { + "ce_loss": 0.06891462951898575, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "distill_loss": 0.13339316844940186, + "epoch": 3.609072715143429, + "step": 10820 + }, + { + "epoch": 3.609072715143429, + "ref_ce_loss": 0.07170667499303818, + "step": 10820 + }, + { + "epoch": 3.6124082721814545, + "loss": 0.4969, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "grad_norm": 2.7403793334960938, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "learning_rate": 0.00018061324651434267, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "loss": 0.471358984708786, + "step": 10830 + }, + { + "ce_loss": 0.10615131258964539, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "distill_loss": 0.14862710237503052, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "ref_ce_loss": 0.10827454179525375, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "loss": 0.3429005742073059, + "step": 10830 + }, + { + "ce_loss": 0.07791846245527267, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "distill_loss": 0.09560949355363846, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "ref_ce_loss": 0.10049939155578613, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "loss": 0.3915432095527649, + "step": 10830 + }, + { + "ce_loss": 0.12821824848651886, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "distill_loss": 0.12204201519489288, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "ref_ce_loss": 0.09869548678398132, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "loss": 0.320291131734848, + "step": 10830 + }, + { + "ce_loss": 0.08379454910755157, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "distill_loss": 0.14412151277065277, + "epoch": 3.6124082721814545, + "step": 10830 + }, + { + "epoch": 3.6124082721814545, + "ref_ce_loss": 0.09217362850904465, + "step": 10830 + }, + { + "epoch": 3.61574382921948, + "loss": 0.4848, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "grad_norm": 2.4884891510009766, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "learning_rate": 0.00018041492065209755, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "loss": 0.277363121509552, + "step": 10840 + }, + { + "ce_loss": 0.0627259761095047, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "distill_loss": 0.12686988711357117, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "ref_ce_loss": 0.054359905421733856, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "loss": 0.41814225912094116, + "step": 10840 + }, + { + "ce_loss": 0.13269467651844025, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "distill_loss": 0.14560770988464355, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "ref_ce_loss": 0.0991872251033783, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "loss": 0.3902513384819031, + "step": 10840 + }, + { + "ce_loss": 0.11417891830205917, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "distill_loss": 0.1708763986825943, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "ref_ce_loss": 0.10510201752185822, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "loss": 0.4186701476573944, + "step": 10840 + }, + { + "ce_loss": 0.0700027123093605, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "distill_loss": 0.1922881007194519, + "epoch": 3.61574382921948, + "step": 10840 + }, + { + "epoch": 3.61574382921948, + "ref_ce_loss": 0.0704408511519432, + "step": 10840 + }, + { + "epoch": 3.6190793862575052, + "loss": 0.4628, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "grad_norm": 5.111220359802246, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "learning_rate": 0.00018021653932503493, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "loss": 0.4838990569114685, + "step": 10850 + }, + { + "ce_loss": 0.13407105207443237, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "distill_loss": 0.14027553796768188, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "ref_ce_loss": 0.11734570562839508, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "loss": 0.6074638366699219, + "step": 10850 + }, + { + "ce_loss": 0.17922604084014893, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "distill_loss": 0.1384764015674591, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "ref_ce_loss": 0.12209158390760422, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "loss": 0.5015256404876709, + "step": 10850 + }, + { + "ce_loss": 0.14106027781963348, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "distill_loss": 0.17080964148044586, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "ref_ce_loss": 0.08879175037145615, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "loss": 0.42805197834968567, + "step": 10850 + }, + { + "ce_loss": 0.15597136318683624, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "distill_loss": 0.10954298824071884, + "epoch": 3.6190793862575052, + "step": 10850 + }, + { + "epoch": 3.6190793862575052, + "ref_ce_loss": 0.1303330808877945, + "step": 10850 + }, + { + "epoch": 3.6224149432955306, + "loss": 0.4698, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "grad_norm": 2.6651175022125244, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "learning_rate": 0.00018001810289492405, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "loss": 0.534658670425415, + "step": 10860 + }, + { + "ce_loss": 0.044203534722328186, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "distill_loss": 0.14794570207595825, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "ref_ce_loss": 0.08161613345146179, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "loss": 0.7004905939102173, + "step": 10860 + }, + { + "ce_loss": 0.11076775193214417, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "distill_loss": 0.17937146127223969, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "ref_ce_loss": 0.12348173558712006, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "loss": 0.460467666387558, + "step": 10860 + }, + { + "ce_loss": 0.12757205963134766, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "distill_loss": 0.13403235375881195, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "ref_ce_loss": 0.15560974180698395, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "loss": 0.401950478553772, + "step": 10860 + }, + { + "ce_loss": 0.09800982475280762, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "distill_loss": 0.15696462988853455, + "epoch": 3.6224149432955306, + "step": 10860 + }, + { + "epoch": 3.6224149432955306, + "ref_ce_loss": 0.07699155062437057, + "step": 10860 + }, + { + "epoch": 3.625750500333556, + "loss": 0.457, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "grad_norm": 2.0317394733428955, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "learning_rate": 0.00017981961172363462, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "loss": 0.3700401782989502, + "step": 10870 + }, + { + "ce_loss": 0.10517442226409912, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "distill_loss": 0.12143470346927643, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "ref_ce_loss": 0.09167401492595673, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "loss": 0.4842488169670105, + "step": 10870 + }, + { + "ce_loss": 0.1177164688706398, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "distill_loss": 0.1058540940284729, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "ref_ce_loss": 0.08583217859268188, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "loss": 0.5331366658210754, + "step": 10870 + }, + { + "ce_loss": 0.13608968257904053, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "distill_loss": 0.19230999052524567, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "ref_ce_loss": 0.09139268100261688, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "loss": 1.190303921699524, + "step": 10870 + }, + { + "ce_loss": 0.25734570622444153, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "distill_loss": 0.16222265362739563, + "epoch": 3.625750500333556, + "step": 10870 + }, + { + "epoch": 3.625750500333556, + "ref_ce_loss": 0.18444815278053284, + "step": 10870 + }, + { + "epoch": 3.6290860573715813, + "loss": 0.4817, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "grad_norm": 2.306195020675659, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "learning_rate": 0.00017962106617313626, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "loss": 0.6413604617118835, + "step": 10880 + }, + { + "ce_loss": 0.22188203036785126, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "distill_loss": 0.21289722621440887, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "ref_ce_loss": 0.20646364986896515, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "loss": 0.36448946595191956, + "step": 10880 + }, + { + "ce_loss": 0.08776570856571198, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "distill_loss": 0.12724003195762634, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "ref_ce_loss": 0.10194741189479828, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "loss": 0.472312331199646, + "step": 10880 + }, + { + "ce_loss": 0.16327980160713196, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "distill_loss": 0.16509795188903809, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "ref_ce_loss": 0.10424279421567917, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "loss": 0.40606415271759033, + "step": 10880 + }, + { + "ce_loss": 0.047251492738723755, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "distill_loss": 0.1290288269519806, + "epoch": 3.6290860573715813, + "step": 10880 + }, + { + "epoch": 3.6290860573715813, + "ref_ce_loss": 0.09942927211523056, + "step": 10880 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.4377, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "grad_norm": 2.150160789489746, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "learning_rate": 0.0001794224666054978, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.3624217212200165, + "step": 10890 + }, + { + "ce_loss": 0.11279918998479843, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "distill_loss": 0.12640734016895294, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "ref_ce_loss": 0.12293494492769241, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.40093767642974854, + "step": 10890 + }, + { + "ce_loss": 0.1267414391040802, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "distill_loss": 0.10430145263671875, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "ref_ce_loss": 0.07403694093227386, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.41296109557151794, + "step": 10890 + }, + { + "ce_loss": 0.1552402824163437, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "distill_loss": 0.089487724006176, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "ref_ce_loss": 0.13549669086933136, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "loss": 0.5205192565917969, + "step": 10890 + }, + { + "ce_loss": 0.1464664340019226, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "distill_loss": 0.1508297473192215, + "epoch": 3.6324216144096066, + "step": 10890 + }, + { + "epoch": 3.6324216144096066, + "ref_ce_loss": 0.11153878271579742, + "step": 10890 + }, + { + "epoch": 3.635757171447632, + "loss": 0.4605, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "grad_norm": 6.012615203857422, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "learning_rate": 0.00017922381338288646, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "loss": 0.835310161113739, + "step": 10900 + }, + { + "ce_loss": 0.2168867439031601, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "distill_loss": 0.21579332649707794, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "ref_ce_loss": 0.15783701837062836, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "loss": 0.4475416839122772, + "step": 10900 + }, + { + "ce_loss": 0.18404029309749603, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "distill_loss": 0.13007982075214386, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "ref_ce_loss": 0.10602632164955139, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "loss": 0.3571226894855499, + "step": 10900 + }, + { + "ce_loss": 0.08436594903469086, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "distill_loss": 0.12900224328041077, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "ref_ce_loss": 0.09496305882930756, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "loss": 0.5006234645843506, + "step": 10900 + }, + { + "ce_loss": 0.10648602992296219, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "distill_loss": 0.17396169900894165, + "epoch": 3.635757171447632, + "step": 10900 + }, + { + "epoch": 3.635757171447632, + "ref_ce_loss": 0.14211241900920868, + "step": 10900 + }, + { + "epoch": 3.6390927284856573, + "loss": 0.5126, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "grad_norm": 2.959005355834961, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "learning_rate": 0.00017902510686756737, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "loss": 0.6385618448257446, + "step": 10910 + }, + { + "ce_loss": 0.21390993893146515, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "distill_loss": 0.2231408804655075, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "ref_ce_loss": 0.12439989298582077, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "loss": 0.21764978766441345, + "step": 10910 + }, + { + "ce_loss": 0.04720484837889671, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "distill_loss": 0.09097013622522354, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "ref_ce_loss": 0.058279022574424744, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "loss": 0.4075714349746704, + "step": 10910 + }, + { + "ce_loss": 0.10580960661172867, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "distill_loss": 0.2019980251789093, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "ref_ce_loss": 0.09952415525913239, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "loss": 0.4693651497364044, + "step": 10910 + }, + { + "ce_loss": 0.1692083179950714, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "distill_loss": 0.1516937017440796, + "epoch": 3.6390927284856573, + "step": 10910 + }, + { + "epoch": 3.6390927284856573, + "ref_ce_loss": 0.08030670136213303, + "step": 10910 + }, + { + "epoch": 3.6424282855236827, + "loss": 0.5253, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "grad_norm": 2.7803750038146973, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "learning_rate": 0.00017882634742190278, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "loss": 0.6413553953170776, + "step": 10920 + }, + { + "ce_loss": 0.13203197717666626, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "distill_loss": 0.1417599618434906, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "ref_ce_loss": 0.09080624580383301, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "loss": 0.4760216474533081, + "step": 10920 + }, + { + "ce_loss": 0.09282150864601135, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "distill_loss": 0.14410977065563202, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "ref_ce_loss": 0.07130226492881775, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "loss": 0.38041651248931885, + "step": 10920 + }, + { + "ce_loss": 0.04239816591143608, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "distill_loss": 0.15716636180877686, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "ref_ce_loss": 0.06270455569028854, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "loss": 0.5958917140960693, + "step": 10920 + }, + { + "ce_loss": 0.12721644341945648, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "distill_loss": 0.19002792239189148, + "epoch": 3.6424282855236827, + "step": 10920 + }, + { + "epoch": 3.6424282855236827, + "ref_ce_loss": 0.13948236405849457, + "step": 10920 + }, + { + "epoch": 3.645763842561708, + "loss": 0.4639, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "grad_norm": 2.47678804397583, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "learning_rate": 0.0001786275354083516, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "loss": 0.567202627658844, + "step": 10930 + }, + { + "ce_loss": 0.11808883398771286, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "distill_loss": 0.15721075236797333, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "ref_ce_loss": 0.06056002900004387, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "loss": 0.49473297595977783, + "step": 10930 + }, + { + "ce_loss": 0.15821705758571625, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "distill_loss": 0.1436956524848938, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "ref_ce_loss": 0.10956276953220367, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "loss": 0.4342118203639984, + "step": 10930 + }, + { + "ce_loss": 0.14422941207885742, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "distill_loss": 0.17922310531139374, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "ref_ce_loss": 0.08042961359024048, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "loss": 0.49232393503189087, + "step": 10930 + }, + { + "ce_loss": 0.14703483879566193, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "distill_loss": 0.15014778077602386, + "epoch": 3.645763842561708, + "step": 10930 + }, + { + "epoch": 3.645763842561708, + "ref_ce_loss": 0.10249628871679306, + "step": 10930 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.4804, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "grad_norm": 3.3948302268981934, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "learning_rate": 0.0001784286711894685, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.5856025218963623, + "step": 10940 + }, + { + "ce_loss": 0.15770310163497925, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "distill_loss": 0.16146592795848846, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "ref_ce_loss": 0.13260933756828308, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.3507378399372101, + "step": 10940 + }, + { + "ce_loss": 0.07525129616260529, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "distill_loss": 0.1550418585538864, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "ref_ce_loss": 0.09258686006069183, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.2977530360221863, + "step": 10940 + }, + { + "ce_loss": 0.06264433264732361, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "distill_loss": 0.1284417361021042, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "ref_ce_loss": 0.06895040720701218, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "loss": 0.28779518604278564, + "step": 10940 + }, + { + "ce_loss": 0.05994941294193268, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "distill_loss": 0.1388159692287445, + "epoch": 3.6490993995997334, + "step": 10940 + }, + { + "epoch": 3.6490993995997334, + "ref_ce_loss": 0.060144439339637756, + "step": 10940 + }, + { + "epoch": 3.6524349566377587, + "loss": 0.5031, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "grad_norm": 5.514958381652832, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "learning_rate": 0.0001782297551279033, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "loss": 0.49093204736709595, + "step": 10950 + }, + { + "ce_loss": 0.15218622982501984, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "distill_loss": 0.11811237782239914, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "ref_ce_loss": 0.0893976241350174, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "loss": 0.3062590956687927, + "step": 10950 + }, + { + "ce_loss": 0.04653836041688919, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "distill_loss": 0.10441438853740692, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "ref_ce_loss": 0.08877456188201904, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "loss": 0.6125166416168213, + "step": 10950 + }, + { + "ce_loss": 0.117793507874012, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "distill_loss": 0.1392459124326706, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "ref_ce_loss": 0.12085837125778198, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "loss": 0.5856779217720032, + "step": 10950 + }, + { + "ce_loss": 0.17838098108768463, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "distill_loss": 0.13888207077980042, + "epoch": 3.6524349566377587, + "step": 10950 + }, + { + "epoch": 3.6524349566377587, + "ref_ce_loss": 0.09558483958244324, + "step": 10950 + }, + { + "epoch": 3.655770513675784, + "loss": 0.4924, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "grad_norm": 3.244112730026245, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "learning_rate": 0.00017803078758640053, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "loss": 0.37806224822998047, + "step": 10960 + }, + { + "ce_loss": 0.1090494692325592, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "distill_loss": 0.13142424821853638, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "ref_ce_loss": 0.09739561378955841, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "loss": 0.594853937625885, + "step": 10960 + }, + { + "ce_loss": 0.09211955219507217, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "distill_loss": 0.12220696359872818, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "ref_ce_loss": 0.0728682428598404, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "loss": 0.22912947833538055, + "step": 10960 + }, + { + "ce_loss": 0.05628032609820366, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "distill_loss": 0.11781372129917145, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "ref_ce_loss": 0.054764240980148315, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "loss": 0.2798725962638855, + "step": 10960 + }, + { + "ce_loss": 0.09618540853261948, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "distill_loss": 0.11366327106952667, + "epoch": 3.655770513675784, + "step": 10960 + }, + { + "epoch": 3.655770513675784, + "ref_ce_loss": 0.05259474366903305, + "step": 10960 + }, + { + "epoch": 3.6591060707138094, + "loss": 0.4712, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "grad_norm": 3.9033915996551514, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "learning_rate": 0.00017783176892779834, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "loss": 0.506156861782074, + "step": 10970 + }, + { + "ce_loss": 0.09005085378885269, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "distill_loss": 0.10344702750444412, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "ref_ce_loss": 0.09592948108911514, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "loss": 0.4516763985157013, + "step": 10970 + }, + { + "ce_loss": 0.1697418987751007, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "distill_loss": 0.15038640797138214, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "ref_ce_loss": 0.13058653473854065, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "loss": 0.3005906939506531, + "step": 10970 + }, + { + "ce_loss": 0.07280378043651581, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "distill_loss": 0.13411296904087067, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "ref_ce_loss": 0.09324061870574951, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "loss": 0.4326677918434143, + "step": 10970 + }, + { + "ce_loss": 0.13207785785198212, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "distill_loss": 0.1927163451910019, + "epoch": 3.6591060707138094, + "step": 10970 + }, + { + "epoch": 3.6591060707138094, + "ref_ce_loss": 0.10759655386209488, + "step": 10970 + }, + { + "epoch": 3.662441627751835, + "loss": 0.556, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "grad_norm": 2.6602776050567627, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "learning_rate": 0.00017763269951502844, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "loss": 0.4683450162410736, + "step": 10980 + }, + { + "ce_loss": 0.14464636147022247, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "distill_loss": 0.16256146132946014, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "ref_ce_loss": 0.1233372688293457, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "loss": 0.4439935088157654, + "step": 10980 + }, + { + "ce_loss": 0.15323685109615326, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "distill_loss": 0.16922098398208618, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "ref_ce_loss": 0.09869551658630371, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "loss": 0.4825940728187561, + "step": 10980 + }, + { + "ce_loss": 0.14853030443191528, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "distill_loss": 0.2026570588350296, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "ref_ce_loss": 0.08260758221149445, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "loss": 0.3823135793209076, + "step": 10980 + }, + { + "ce_loss": 0.09527653455734253, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "distill_loss": 0.1679757833480835, + "epoch": 3.662441627751835, + "step": 10980 + }, + { + "epoch": 3.662441627751835, + "ref_ce_loss": 0.0711362436413765, + "step": 10980 + }, + { + "epoch": 3.66577718478986, + "loss": 0.5047, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "grad_norm": 4.2559494972229, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "learning_rate": 0.00017743357971111487, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "loss": 0.34644657373428345, + "step": 10990 + }, + { + "ce_loss": 0.12960556149482727, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "distill_loss": 0.12763711810112, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "ref_ce_loss": 0.08901886641979218, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "loss": 0.41295936703681946, + "step": 10990 + }, + { + "ce_loss": 0.10422497987747192, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "distill_loss": 0.10833515971899033, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "ref_ce_loss": 0.12332861870527267, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "loss": 0.6116182804107666, + "step": 10990 + }, + { + "ce_loss": 0.13335031270980835, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "distill_loss": 0.20115993916988373, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "ref_ce_loss": 0.11726392805576324, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "loss": 1.1295512914657593, + "step": 10990 + }, + { + "ce_loss": 0.18608838319778442, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "distill_loss": 0.2441026270389557, + "epoch": 3.66577718478986, + "step": 10990 + }, + { + "epoch": 3.66577718478986, + "ref_ce_loss": 0.11386443674564362, + "step": 10990 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.4915, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "grad_norm": 2.3816754817962646, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "learning_rate": 0.00017723440987917353, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.4174056053161621, + "step": 11000 + }, + { + "ce_loss": 0.12153197824954987, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "distill_loss": 0.12821528315544128, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "ref_ce_loss": 0.08482295274734497, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.45367807149887085, + "step": 11000 + }, + { + "ce_loss": 0.1108308881521225, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "distill_loss": 0.14451178908348083, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "ref_ce_loss": 0.1315654218196869, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.4495985507965088, + "step": 11000 + }, + { + "ce_loss": 0.09988871216773987, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "distill_loss": 0.13498930633068085, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "ref_ce_loss": 0.09648939967155457, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "loss": 0.48693129420280457, + "step": 11000 + }, + { + "ce_loss": 0.19086134433746338, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "distill_loss": 0.1703328639268875, + "epoch": 3.6691127418278855, + "step": 11000 + }, + { + "epoch": 3.6691127418278855, + "ref_ce_loss": 0.12523435056209564, + "step": 11000 + }, + { + "epoch": 3.672448298865911, + "loss": 0.5174, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "grad_norm": 2.777346611022949, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "learning_rate": 0.0001770351903824116, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "loss": 0.39073672890663147, + "step": 11010 + }, + { + "ce_loss": 0.1334664672613144, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "distill_loss": 0.14954569935798645, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "ref_ce_loss": 0.10750074684619904, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "loss": 0.4113316833972931, + "step": 11010 + }, + { + "ce_loss": 0.06988398730754852, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "distill_loss": 0.16288743913173676, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "ref_ce_loss": 0.08769288659095764, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "loss": 0.6188124418258667, + "step": 11010 + }, + { + "ce_loss": 0.1457642912864685, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "distill_loss": 0.15776768326759338, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "ref_ce_loss": 0.11883719265460968, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "loss": 0.531037449836731, + "step": 11010 + }, + { + "ce_loss": 0.14301258325576782, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "distill_loss": 0.1943218857049942, + "epoch": 3.672448298865911, + "step": 11010 + }, + { + "epoch": 3.672448298865911, + "ref_ce_loss": 0.08588794618844986, + "step": 11010 + }, + { + "epoch": 3.675783855903936, + "loss": 0.4489, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "grad_norm": 2.049055814743042, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "learning_rate": 0.00017683592158412704, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "loss": 0.5385304689407349, + "step": 11020 + }, + { + "ce_loss": 0.08100423961877823, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "distill_loss": 0.1679883748292923, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "ref_ce_loss": 0.10024969279766083, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "loss": 0.7263086438179016, + "step": 11020 + }, + { + "ce_loss": 0.09569170325994492, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "distill_loss": 0.18969206511974335, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "ref_ce_loss": 0.11134276539087296, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "loss": 0.8118753433227539, + "step": 11020 + }, + { + "ce_loss": 0.200321227312088, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "distill_loss": 0.20911133289337158, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "ref_ce_loss": 0.11153283715248108, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "loss": 0.4810273051261902, + "step": 11020 + }, + { + "ce_loss": 0.08565720915794373, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "distill_loss": 0.13249194622039795, + "epoch": 3.675783855903936, + "step": 11020 + }, + { + "epoch": 3.675783855903936, + "ref_ce_loss": 0.11665981262922287, + "step": 11020 + }, + { + "epoch": 3.6791194129419615, + "loss": 0.5263, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "grad_norm": 12.643746376037598, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "learning_rate": 0.00017663660384770739, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "loss": 0.8190609216690063, + "step": 11030 + }, + { + "ce_loss": 0.1600896418094635, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "distill_loss": 0.19733691215515137, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "ref_ce_loss": 0.14073491096496582, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "loss": 1.0265297889709473, + "step": 11030 + }, + { + "ce_loss": 0.22034034132957458, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "distill_loss": 0.19493655860424042, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "ref_ce_loss": 0.18260297179222107, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "loss": 0.36415034532546997, + "step": 11030 + }, + { + "ce_loss": 0.07882717996835709, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "distill_loss": 0.17585335671901703, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "ref_ce_loss": 0.06001151353120804, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "loss": 0.47990405559539795, + "step": 11030 + }, + { + "ce_loss": 0.1564231514930725, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "distill_loss": 0.173221617937088, + "epoch": 3.6791194129419615, + "step": 11030 + }, + { + "epoch": 3.6791194129419615, + "ref_ce_loss": 0.07495393604040146, + "step": 11030 + }, + { + "epoch": 3.682454969979987, + "loss": 0.5324, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "grad_norm": 3.118147134780884, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "learning_rate": 0.00017643723753662954, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "loss": 0.5376815795898438, + "step": 11040 + }, + { + "ce_loss": 0.16959786415100098, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "distill_loss": 0.16002260148525238, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "ref_ce_loss": 0.1409972459077835, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "loss": 0.5773664116859436, + "step": 11040 + }, + { + "ce_loss": 0.1668001413345337, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "distill_loss": 0.15590955317020416, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "ref_ce_loss": 0.11919999867677689, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "loss": 0.45798033475875854, + "step": 11040 + }, + { + "ce_loss": 0.13292302191257477, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "distill_loss": 0.16225001215934753, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "ref_ce_loss": 0.08494387567043304, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "loss": 0.5558249950408936, + "step": 11040 + }, + { + "ce_loss": 0.12711194157600403, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "distill_loss": 0.21948517858982086, + "epoch": 3.682454969979987, + "step": 11040 + }, + { + "epoch": 3.682454969979987, + "ref_ce_loss": 0.0999494194984436, + "step": 11040 + }, + { + "epoch": 3.6857905270180122, + "loss": 0.5009, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "grad_norm": 4.109448432922363, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "learning_rate": 0.00017623782301445917, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "loss": 0.43522462248802185, + "step": 11050 + }, + { + "ce_loss": 0.11968658119440079, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "distill_loss": 0.13889183104038239, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "ref_ce_loss": 0.08983059227466583, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "loss": 0.6420830488204956, + "step": 11050 + }, + { + "ce_loss": 0.15069016814231873, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "distill_loss": 0.21941916644573212, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "ref_ce_loss": 0.1224115639925003, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "loss": 0.3963969647884369, + "step": 11050 + }, + { + "ce_loss": 0.09678657352924347, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "distill_loss": 0.09813752770423889, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "ref_ce_loss": 0.08016207069158554, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "loss": 0.21546831727027893, + "step": 11050 + }, + { + "ce_loss": 0.027873264625668526, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "distill_loss": 0.10052196681499481, + "epoch": 3.6857905270180122, + "step": 11050 + }, + { + "epoch": 3.6857905270180122, + "ref_ce_loss": 0.05747190862894058, + "step": 11050 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.4898, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "grad_norm": 2.2396340370178223, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "learning_rate": 0.00017603836064484949, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.16668157279491425, + "step": 11060 + }, + { + "ce_loss": 0.011731747537851334, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "distill_loss": 0.09391668438911438, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "ref_ce_loss": 0.06087285652756691, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.36854448914527893, + "step": 11060 + }, + { + "ce_loss": 0.108416847884655, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "distill_loss": 0.13391919434070587, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "ref_ce_loss": 0.08919122070074081, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.28041553497314453, + "step": 11060 + }, + { + "ce_loss": 0.05195368081331253, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "distill_loss": 0.07924183458089828, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "ref_ce_loss": 0.09253819286823273, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "loss": 0.3154188096523285, + "step": 11060 + }, + { + "ce_loss": 0.10825911909341812, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "distill_loss": 0.11209943145513535, + "epoch": 3.6891260840560376, + "step": 11060 + }, + { + "epoch": 3.6891260840560376, + "ref_ce_loss": 0.09472549706697464, + "step": 11060 + }, + { + "epoch": 3.692461641094063, + "loss": 0.5081, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "grad_norm": 4.824976921081543, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "learning_rate": 0.0001758388507915413, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "loss": 0.5516034364700317, + "step": 11070 + }, + { + "ce_loss": 0.21088926494121552, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "distill_loss": 0.1855248510837555, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "ref_ce_loss": 0.11560109257698059, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "loss": 0.32801610231399536, + "step": 11070 + }, + { + "ce_loss": 0.04863592982292175, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "distill_loss": 0.14596258103847504, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "ref_ce_loss": 0.06314782798290253, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "loss": 0.583217203617096, + "step": 11070 + }, + { + "ce_loss": 0.15474650263786316, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "distill_loss": 0.19245094060897827, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "ref_ce_loss": 0.09346684068441391, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "loss": 0.4624575972557068, + "step": 11070 + }, + { + "ce_loss": 0.1203080266714096, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "distill_loss": 0.12611785531044006, + "epoch": 3.692461641094063, + "step": 11070 + }, + { + "epoch": 3.692461641094063, + "ref_ce_loss": 0.13801667094230652, + "step": 11070 + }, + { + "epoch": 3.6957971981320883, + "loss": 0.4731, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "grad_norm": 3.346745014190674, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "learning_rate": 0.00017563929381836192, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "loss": 0.3251184821128845, + "step": 11080 + }, + { + "ce_loss": 0.1268000304698944, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "distill_loss": 0.10941871255636215, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "ref_ce_loss": 0.0887828916311264, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "loss": 0.41327232122421265, + "step": 11080 + }, + { + "ce_loss": 0.1111992746591568, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "distill_loss": 0.11428511887788773, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "ref_ce_loss": 0.08257275819778442, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "loss": 0.5031070113182068, + "step": 11080 + }, + { + "ce_loss": 0.16362826526165009, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "distill_loss": 0.1515190601348877, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "ref_ce_loss": 0.15207554399967194, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "loss": 0.7305903434753418, + "step": 11080 + }, + { + "ce_loss": 0.13129164278507233, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "distill_loss": 0.181007519364357, + "epoch": 3.6957971981320883, + "step": 11080 + }, + { + "epoch": 3.6957971981320883, + "ref_ce_loss": 0.08313669264316559, + "step": 11080 + }, + { + "epoch": 3.6991327551701136, + "loss": 0.5314, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "grad_norm": 4.584631443023682, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "learning_rate": 0.00017543969008922448, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "loss": 0.6552540063858032, + "step": 11090 + }, + { + "ce_loss": 0.1395750343799591, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "distill_loss": 0.22694694995880127, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "ref_ce_loss": 0.12240070104598999, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "loss": 0.37868496775627136, + "step": 11090 + }, + { + "ce_loss": 0.13656875491142273, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "distill_loss": 0.1315259039402008, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "ref_ce_loss": 0.0780695304274559, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "loss": 0.7789294719696045, + "step": 11090 + }, + { + "ce_loss": 0.15391351282596588, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "distill_loss": 0.2258004993200302, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "ref_ce_loss": 0.1496977061033249, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "loss": 0.5458284020423889, + "step": 11090 + }, + { + "ce_loss": 0.08628889918327332, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "distill_loss": 0.11481862515211105, + "epoch": 3.6991327551701136, + "step": 11090 + }, + { + "epoch": 3.6991327551701136, + "ref_ce_loss": 0.08423425257205963, + "step": 11090 + }, + { + "epoch": 3.702468312208139, + "loss": 0.4868, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "grad_norm": 2.533743381500244, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "learning_rate": 0.00017524003996812742, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "loss": 0.7007694840431213, + "step": 11100 + }, + { + "ce_loss": 0.29086416959762573, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "distill_loss": 0.20586749911308289, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "ref_ce_loss": 0.0947490781545639, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "loss": 0.8538395166397095, + "step": 11100 + }, + { + "ce_loss": 0.1478804051876068, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "distill_loss": 0.14652153849601746, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "ref_ce_loss": 0.11311804503202438, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "loss": 0.3475764989852905, + "step": 11100 + }, + { + "ce_loss": 0.054964080452919006, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "distill_loss": 0.1031276285648346, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "ref_ce_loss": 0.0805249959230423, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "loss": 0.28178277611732483, + "step": 11100 + }, + { + "ce_loss": 0.040550194680690765, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "distill_loss": 0.12443134188652039, + "epoch": 3.702468312208139, + "step": 11100 + }, + { + "epoch": 3.702468312208139, + "ref_ce_loss": 0.08089882880449295, + "step": 11100 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.4953, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "grad_norm": 4.474017143249512, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "learning_rate": 0.00017504034381915387, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.2509787678718567, + "step": 11110 + }, + { + "ce_loss": 0.06018523499369621, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "distill_loss": 0.08989400416612625, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "ref_ce_loss": 0.06109092757105827, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.6448087096214294, + "step": 11110 + }, + { + "ce_loss": 0.10397692024707794, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "distill_loss": 0.1309659481048584, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "ref_ce_loss": 0.08222094178199768, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.4834359288215637, + "step": 11110 + }, + { + "ce_loss": 0.1207953616976738, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "distill_loss": 0.1629929095506668, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "ref_ce_loss": 0.11464487761259079, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "loss": 0.2799364924430847, + "step": 11110 + }, + { + "ce_loss": 0.04275348037481308, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "distill_loss": 0.12101711332798004, + "epoch": 3.7058038692461643, + "step": 11110 + }, + { + "epoch": 3.7058038692461643, + "ref_ce_loss": 0.07686648517847061, + "step": 11110 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.4836, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "grad_norm": 2.729823589324951, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "learning_rate": 0.0001748406020064708, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.4658726155757904, + "step": 11120 + }, + { + "ce_loss": 0.1476747840642929, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "distill_loss": 0.21464860439300537, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "ref_ce_loss": 0.08882784843444824, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.37286797165870667, + "step": 11120 + }, + { + "ce_loss": 0.05277108773589134, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "distill_loss": 0.21334585547447205, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "ref_ce_loss": 0.07608553767204285, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.32241755723953247, + "step": 11120 + }, + { + "ce_loss": 0.07858740538358688, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "distill_loss": 0.10054327547550201, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "ref_ce_loss": 0.09804639965295792, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "loss": 0.35880106687545776, + "step": 11120 + }, + { + "ce_loss": 0.08212389796972275, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "distill_loss": 0.12222301214933395, + "epoch": 3.7091394262841897, + "step": 11120 + }, + { + "epoch": 3.7091394262841897, + "ref_ce_loss": 0.11375194042921066, + "step": 11120 + }, + { + "epoch": 3.712474983322215, + "loss": 0.4209, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "grad_norm": 2.9692301750183105, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "learning_rate": 0.0001746408148943285, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "loss": 0.53624027967453, + "step": 11130 + }, + { + "ce_loss": 0.13671307265758514, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "distill_loss": 0.11101743578910828, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "ref_ce_loss": 0.11485659331083298, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "loss": 0.5098305940628052, + "step": 11130 + }, + { + "ce_loss": 0.17487110197544098, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "distill_loss": 0.1443914771080017, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "ref_ce_loss": 0.12564757466316223, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "loss": 0.38041433691978455, + "step": 11130 + }, + { + "ce_loss": 0.11597348749637604, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "distill_loss": 0.09810422360897064, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "ref_ce_loss": 0.10537905246019363, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "loss": 0.4296853244304657, + "step": 11130 + }, + { + "ce_loss": 0.15972864627838135, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "distill_loss": 0.13555540144443512, + "epoch": 3.712474983322215, + "step": 11130 + }, + { + "epoch": 3.712474983322215, + "ref_ce_loss": 0.13421572744846344, + "step": 11130 + }, + { + "epoch": 3.7158105403602404, + "loss": 0.5083, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "grad_norm": 2.798766851425171, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "learning_rate": 0.00017444098284705983, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "loss": 0.32998281717300415, + "step": 11140 + }, + { + "ce_loss": 0.0770777240395546, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "distill_loss": 0.14420868456363678, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "ref_ce_loss": 0.06987477093935013, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "loss": 0.30745163559913635, + "step": 11140 + }, + { + "ce_loss": 0.10314347594976425, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "distill_loss": 0.12897643446922302, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "ref_ce_loss": 0.07500158250331879, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "loss": 0.5746026635169983, + "step": 11140 + }, + { + "ce_loss": 0.20710504055023193, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "distill_loss": 0.14439168572425842, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "ref_ce_loss": 0.15772093832492828, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "loss": 0.4745178520679474, + "step": 11140 + }, + { + "ce_loss": 0.10678113996982574, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "distill_loss": 0.1293291449546814, + "epoch": 3.7158105403602404, + "step": 11140 + }, + { + "epoch": 3.7158105403602404, + "ref_ce_loss": 0.10236824303865433, + "step": 11140 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.4361, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "grad_norm": 3.206498861312866, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "learning_rate": 0.0001742411062290796, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.4355878531932831, + "step": 11150 + }, + { + "ce_loss": 0.12951213121414185, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "distill_loss": 0.1468600034713745, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "ref_ce_loss": 0.12128368765115738, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.5208489298820496, + "step": 11150 + }, + { + "ce_loss": 0.18008677661418915, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "distill_loss": 0.1475437879562378, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "ref_ce_loss": 0.12079443037509918, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.18894127011299133, + "step": 11150 + }, + { + "ce_loss": 0.04160792753100395, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "distill_loss": 0.061185117810964584, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "ref_ce_loss": 0.08577242493629456, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "loss": 0.3723412752151489, + "step": 11150 + }, + { + "ce_loss": 0.10669536143541336, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "distill_loss": 0.09243173897266388, + "epoch": 3.7191460973982657, + "step": 11150 + }, + { + "epoch": 3.7191460973982657, + "ref_ce_loss": 0.11789239197969437, + "step": 11150 + }, + { + "epoch": 3.722481654436291, + "loss": 0.4262, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "grad_norm": 2.0635910034179688, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "learning_rate": 0.00017404118540488396, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "loss": 0.4313686788082123, + "step": 11160 + }, + { + "ce_loss": 0.12093643099069595, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "distill_loss": 0.1406337320804596, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "ref_ce_loss": 0.10394150018692017, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "loss": 0.5439311265945435, + "step": 11160 + }, + { + "ce_loss": 0.14441914856433868, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "distill_loss": 0.18547749519348145, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "ref_ce_loss": 0.16917400062084198, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "loss": 0.4756580591201782, + "step": 11160 + }, + { + "ce_loss": 0.0756123811006546, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "distill_loss": 0.17472757399082184, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "ref_ce_loss": 0.10686322301626205, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "loss": 0.4173581600189209, + "step": 11160 + }, + { + "ce_loss": 0.10815001279115677, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "distill_loss": 0.16694800555706024, + "epoch": 3.722481654436291, + "step": 11160 + }, + { + "epoch": 3.722481654436291, + "ref_ce_loss": 0.10757031291723251, + "step": 11160 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.5187, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "grad_norm": 2.583024501800537, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "learning_rate": 0.00017384122073904964, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.4847860038280487, + "step": 11170 + }, + { + "ce_loss": 0.1676764339208603, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "distill_loss": 0.14405414462089539, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "ref_ce_loss": 0.12641099095344543, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.3804580867290497, + "step": 11170 + }, + { + "ce_loss": 0.14094428718090057, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "distill_loss": 0.13992449641227722, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "ref_ce_loss": 0.06798779219388962, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.3895184099674225, + "step": 11170 + }, + { + "ce_loss": 0.12548105418682098, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "distill_loss": 0.16592243313789368, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "ref_ce_loss": 0.09731275588274002, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "loss": 0.47543710470199585, + "step": 11170 + }, + { + "ce_loss": 0.15646733343601227, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "distill_loss": 0.12935645878314972, + "epoch": 3.7258172114743164, + "step": 11170 + }, + { + "epoch": 3.7258172114743164, + "ref_ce_loss": 0.11045189946889877, + "step": 11170 + }, + { + "epoch": 3.729152768512342, + "loss": 0.4764, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "grad_norm": 3.158120632171631, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "learning_rate": 0.00017364121259623327, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "loss": 0.3678155243396759, + "step": 11180 + }, + { + "ce_loss": 0.11589225381612778, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "distill_loss": 0.12303879857063293, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "ref_ce_loss": 0.12825645506381989, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "loss": 0.5115286111831665, + "step": 11180 + }, + { + "ce_loss": 0.13510024547576904, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "distill_loss": 0.22685614228248596, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "ref_ce_loss": 0.11010359972715378, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "loss": 0.4209415316581726, + "step": 11180 + }, + { + "ce_loss": 0.17654968798160553, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "distill_loss": 0.12133859843015671, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "ref_ce_loss": 0.09999922662973404, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "loss": 0.46903812885284424, + "step": 11180 + }, + { + "ce_loss": 0.15544381737709045, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "distill_loss": 0.1415010392665863, + "epoch": 3.729152768512342, + "step": 11180 + }, + { + "epoch": 3.729152768512342, + "ref_ce_loss": 0.108729287981987, + "step": 11180 + }, + { + "epoch": 3.732488325550367, + "loss": 0.4928, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "grad_norm": 3.1810953617095947, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "learning_rate": 0.0001734411613411708, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "loss": 0.46836310625076294, + "step": 11190 + }, + { + "ce_loss": 0.10594028979539871, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "distill_loss": 0.13372491300106049, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "ref_ce_loss": 0.08447429537773132, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "loss": 0.4483862519264221, + "step": 11190 + }, + { + "ce_loss": 0.09653332829475403, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "distill_loss": 0.13661277294158936, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "ref_ce_loss": 0.10244203358888626, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "loss": 0.5003663897514343, + "step": 11190 + }, + { + "ce_loss": 0.14703114330768585, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "distill_loss": 0.18907728791236877, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "ref_ce_loss": 0.12094144523143768, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "loss": 0.7119038105010986, + "step": 11190 + }, + { + "ce_loss": 0.1835232377052307, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "distill_loss": 0.20460136234760284, + "epoch": 3.732488325550367, + "step": 11190 + }, + { + "epoch": 3.732488325550367, + "ref_ce_loss": 0.10444125533103943, + "step": 11190 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.4707, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "grad_norm": 2.984477996826172, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "learning_rate": 0.00017324106733867687, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.351445734500885, + "step": 11200 + }, + { + "ce_loss": 0.09723985940217972, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "distill_loss": 0.11793617159128189, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "ref_ce_loss": 0.07710584253072739, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.6089062690734863, + "step": 11200 + }, + { + "ce_loss": 0.16752246022224426, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "distill_loss": 0.12733298540115356, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "ref_ce_loss": 0.1492699831724167, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.38957151770591736, + "step": 11200 + }, + { + "ce_loss": 0.13466042280197144, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "distill_loss": 0.12702538073062897, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "ref_ce_loss": 0.08084917068481445, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "loss": 0.48998987674713135, + "step": 11200 + }, + { + "ce_loss": 0.1286323070526123, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "distill_loss": 0.11189994215965271, + "epoch": 3.7358238825883925, + "step": 11200 + }, + { + "epoch": 3.7358238825883925, + "ref_ce_loss": 0.11733870953321457, + "step": 11200 + }, + { + "epoch": 3.739159439626418, + "loss": 0.4959, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "grad_norm": 2.2686338424682617, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "learning_rate": 0.0001730409309536439, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "loss": 0.7203350067138672, + "step": 11210 + }, + { + "ce_loss": 0.12428060919046402, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "distill_loss": 0.13582146167755127, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "ref_ce_loss": 0.10568521171808243, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "loss": 0.6753644943237305, + "step": 11210 + }, + { + "ce_loss": 0.12828564643859863, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "distill_loss": 0.20919077098369598, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "ref_ce_loss": 0.11066607385873795, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "loss": 1.4700936079025269, + "step": 11210 + }, + { + "ce_loss": 0.07830823957920074, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "distill_loss": 0.1397976577281952, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "ref_ce_loss": 0.06886852532625198, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "loss": 0.6401880979537964, + "step": 11210 + }, + { + "ce_loss": 0.07730946689844131, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "distill_loss": 0.11617650091648102, + "epoch": 3.739159439626418, + "step": 11210 + }, + { + "epoch": 3.739159439626418, + "ref_ce_loss": 0.0711132138967514, + "step": 11210 + }, + { + "epoch": 3.742494996664443, + "loss": 0.5192, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "grad_norm": 3.824856758117676, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "learning_rate": 0.00017284075255104186, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "loss": 0.433307409286499, + "step": 11220 + }, + { + "ce_loss": 0.12180997431278229, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "distill_loss": 0.1235133558511734, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "ref_ce_loss": 0.09671497344970703, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "loss": 0.42754268646240234, + "step": 11220 + }, + { + "ce_loss": 0.15168742835521698, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "distill_loss": 0.1420907974243164, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "ref_ce_loss": 0.09833323955535889, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "loss": 0.4262707531452179, + "step": 11220 + }, + { + "ce_loss": 0.16959330439567566, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "distill_loss": 0.1454181671142578, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "ref_ce_loss": 0.08515819907188416, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "loss": 0.34843114018440247, + "step": 11220 + }, + { + "ce_loss": 0.0687236413359642, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "distill_loss": 0.14176149666309357, + "epoch": 3.742494996664443, + "step": 11220 + }, + { + "epoch": 3.742494996664443, + "ref_ce_loss": 0.09360263496637344, + "step": 11220 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.4694, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "grad_norm": 4.92000675201416, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "learning_rate": 0.00017264053249591704, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.609779953956604, + "step": 11230 + }, + { + "ce_loss": 0.1590002477169037, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "distill_loss": 0.14490337669849396, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "ref_ce_loss": 0.14144301414489746, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.47014445066452026, + "step": 11230 + }, + { + "ce_loss": 0.1638866811990738, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "distill_loss": 0.14338326454162598, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "ref_ce_loss": 0.12581142783164978, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.3591836094856262, + "step": 11230 + }, + { + "ce_loss": 0.11410403251647949, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "distill_loss": 0.13631364703178406, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "ref_ce_loss": 0.0813060849905014, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "loss": 0.5285924077033997, + "step": 11230 + }, + { + "ce_loss": 0.2021360844373703, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "distill_loss": 0.19308476150035858, + "epoch": 3.7458305537024685, + "step": 11230 + }, + { + "epoch": 3.7458305537024685, + "ref_ce_loss": 0.10977895557880402, + "step": 11230 + }, + { + "epoch": 3.749166110740494, + "loss": 0.4772, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "grad_norm": 3.521881103515625, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "learning_rate": 0.00017244027115339192, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "loss": 0.37861761450767517, + "step": 11240 + }, + { + "ce_loss": 0.11489034444093704, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "distill_loss": 0.12115241587162018, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "ref_ce_loss": 0.10814513266086578, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "loss": 0.42796438932418823, + "step": 11240 + }, + { + "ce_loss": 0.13979476690292358, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "distill_loss": 0.12097348272800446, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "ref_ce_loss": 0.0843663215637207, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "loss": 0.3992958068847656, + "step": 11240 + }, + { + "ce_loss": 0.10800452530384064, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "distill_loss": 0.17938587069511414, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "ref_ce_loss": 0.07487044483423233, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "loss": 0.808132529258728, + "step": 11240 + }, + { + "ce_loss": 0.1502559930086136, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "distill_loss": 0.1359129697084427, + "epoch": 3.749166110740494, + "step": 11240 + }, + { + "epoch": 3.749166110740494, + "ref_ce_loss": 0.11720101535320282, + "step": 11240 + }, + { + "epoch": 3.7525016677785192, + "loss": 0.4801, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "grad_norm": 3.7433605194091797, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "learning_rate": 0.00017223996888866423, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "loss": 0.4895704984664917, + "step": 11250 + }, + { + "ce_loss": 0.186688631772995, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "distill_loss": 0.12781491875648499, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "ref_ce_loss": 0.12212307751178741, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "loss": 0.4208083152770996, + "step": 11250 + }, + { + "ce_loss": 0.10923559218645096, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "distill_loss": 0.134412482380867, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "ref_ce_loss": 0.14637266099452972, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "loss": 0.48236986994743347, + "step": 11250 + }, + { + "ce_loss": 0.13481567800045013, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "distill_loss": 0.12056757509708405, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "ref_ce_loss": 0.11300322413444519, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "loss": 0.33931252360343933, + "step": 11250 + }, + { + "ce_loss": 0.09116953611373901, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "distill_loss": 0.15315291285514832, + "epoch": 3.7525016677785192, + "step": 11250 + }, + { + "epoch": 3.7525016677785192, + "ref_ce_loss": 0.09439779818058014, + "step": 11250 + }, + { + "epoch": 3.7558372248165446, + "loss": 0.4252, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "grad_norm": 2.2029407024383545, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "learning_rate": 0.00017203962606700618, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "loss": 0.7249472141265869, + "step": 11260 + }, + { + "ce_loss": 0.09998872131109238, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "distill_loss": 0.1515664905309677, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "ref_ce_loss": 0.12862011790275574, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "loss": 0.5033693909645081, + "step": 11260 + }, + { + "ce_loss": 0.20025144517421722, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "distill_loss": 0.13580387830734253, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "ref_ce_loss": 0.13315941393375397, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "loss": 0.3584093451499939, + "step": 11260 + }, + { + "ce_loss": 0.12843570113182068, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "distill_loss": 0.12751731276512146, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "ref_ce_loss": 0.10165061056613922, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "loss": 0.416264146566391, + "step": 11260 + }, + { + "ce_loss": 0.15564116835594177, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "distill_loss": 0.11993543803691864, + "epoch": 3.7558372248165446, + "step": 11260 + }, + { + "epoch": 3.7558372248165446, + "ref_ce_loss": 0.1401854306459427, + "step": 11260 + }, + { + "epoch": 3.75917278185457, + "loss": 0.4981, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "grad_norm": 4.698095321655273, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "learning_rate": 0.00017183924305376415, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "loss": 0.7320033311843872, + "step": 11270 + }, + { + "ce_loss": 0.2233872264623642, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "distill_loss": 0.14781275391578674, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "ref_ce_loss": 0.13521909713745117, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "loss": 0.3893508017063141, + "step": 11270 + }, + { + "ce_loss": 0.1054534986615181, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "distill_loss": 0.11295004189014435, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "ref_ce_loss": 0.08586245775222778, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "loss": 0.4391804039478302, + "step": 11270 + }, + { + "ce_loss": 0.08601868897676468, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "distill_loss": 0.13763193786144257, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "ref_ce_loss": 0.0853324756026268, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "loss": 0.33526211977005005, + "step": 11270 + }, + { + "ce_loss": 0.03173764422535896, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "distill_loss": 0.09609787166118622, + "epoch": 3.75917278185457, + "step": 11270 + }, + { + "epoch": 3.75917278185457, + "ref_ce_loss": 0.0757674053311348, + "step": 11270 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.4434, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "grad_norm": 2.5927469730377197, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "learning_rate": 0.00017163882021435764, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.4663512706756592, + "step": 11280 + }, + { + "ce_loss": 0.15741774439811707, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "distill_loss": 0.14555154740810394, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "ref_ce_loss": 0.10073420405387878, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.36396872997283936, + "step": 11280 + }, + { + "ce_loss": 0.11238844692707062, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "distill_loss": 0.1512582004070282, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "ref_ce_loss": 0.10009444504976273, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.3391995131969452, + "step": 11280 + }, + { + "ce_loss": 0.11951491981744766, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "distill_loss": 0.10620466619729996, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "ref_ce_loss": 0.07334575802087784, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "loss": 0.4804520606994629, + "step": 11280 + }, + { + "ce_loss": 0.07516561448574066, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "distill_loss": 0.1316811740398407, + "epoch": 3.7625083388925953, + "step": 11280 + }, + { + "epoch": 3.7625083388925953, + "ref_ce_loss": 0.08692770451307297, + "step": 11280 + }, + { + "epoch": 3.7658438959306206, + "loss": 0.5076, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "grad_norm": 4.046371936798096, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "learning_rate": 0.00017143835791427888, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "loss": 0.6690834164619446, + "step": 11290 + }, + { + "ce_loss": 0.15648917853832245, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "distill_loss": 0.188400000333786, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "ref_ce_loss": 0.10655590146780014, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "loss": 0.24589793384075165, + "step": 11290 + }, + { + "ce_loss": 0.04482785612344742, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "distill_loss": 0.09695935249328613, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "ref_ce_loss": 0.06211121380329132, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "loss": 0.24969103932380676, + "step": 11290 + }, + { + "ce_loss": 0.061918340623378754, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "distill_loss": 0.09989786893129349, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "ref_ce_loss": 0.08783142268657684, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "loss": 0.3797447085380554, + "step": 11290 + }, + { + "ce_loss": 0.09772893786430359, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "distill_loss": 0.13673144578933716, + "epoch": 3.7658438959306206, + "step": 11290 + }, + { + "epoch": 3.7658438959306206, + "ref_ce_loss": 0.09559912979602814, + "step": 11290 + }, + { + "epoch": 3.769179452968646, + "loss": 0.4614, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "grad_norm": 3.4041154384613037, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "learning_rate": 0.0001712378565190921, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "loss": 0.37293708324432373, + "step": 11300 + }, + { + "ce_loss": 0.09539689123630524, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "distill_loss": 0.153346985578537, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "ref_ce_loss": 0.08379501849412918, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "loss": 0.3446539044380188, + "step": 11300 + }, + { + "ce_loss": 0.10582612454891205, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "distill_loss": 0.12128449976444244, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "ref_ce_loss": 0.09659599512815475, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "loss": 0.7190386056900024, + "step": 11300 + }, + { + "ce_loss": 0.08890789747238159, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "distill_loss": 0.12199924141168594, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "ref_ce_loss": 0.12765192985534668, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "loss": 0.41915977001190186, + "step": 11300 + }, + { + "ce_loss": 0.13943921029567719, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "distill_loss": 0.16844049096107483, + "epoch": 3.769179452968646, + "step": 11300 + }, + { + "epoch": 3.769179452968646, + "ref_ce_loss": 0.11106761544942856, + "step": 11300 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.4793, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "grad_norm": 4.181412696838379, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "learning_rate": 0.0001710373163944326, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.48942622542381287, + "step": 11310 + }, + { + "ce_loss": 0.17593348026275635, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "distill_loss": 0.13092395663261414, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "ref_ce_loss": 0.10657544434070587, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.2934868335723877, + "step": 11310 + }, + { + "ce_loss": 0.07878759503364563, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "distill_loss": 0.09831201285123825, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "ref_ce_loss": 0.08740630000829697, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.3965328633785248, + "step": 11310 + }, + { + "ce_loss": 0.09016729146242142, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "distill_loss": 0.09539443999528885, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "ref_ce_loss": 0.07570146769285202, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "loss": 0.5012593269348145, + "step": 11310 + }, + { + "ce_loss": 0.1913784295320511, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "distill_loss": 0.16016805171966553, + "epoch": 3.7725150100066713, + "step": 11310 + }, + { + "epoch": 3.7725150100066713, + "ref_ce_loss": 0.11406117677688599, + "step": 11310 + }, + { + "epoch": 3.7758505670446967, + "loss": 0.486, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "grad_norm": 3.193751573562622, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "learning_rate": 0.00017083673790600648, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "loss": 1.0421459674835205, + "step": 11320 + }, + { + "ce_loss": 0.145387202501297, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "distill_loss": 0.19037388265132904, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "ref_ce_loss": 0.10654689371585846, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "loss": 0.6212438344955444, + "step": 11320 + }, + { + "ce_loss": 0.14684922993183136, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "distill_loss": 0.1758728325366974, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "ref_ce_loss": 0.12208641320466995, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "loss": 0.6268714070320129, + "step": 11320 + }, + { + "ce_loss": 0.20063500106334686, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "distill_loss": 0.16264456510543823, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "ref_ce_loss": 0.07537572830915451, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "loss": 0.6568611860275269, + "step": 11320 + }, + { + "ce_loss": 0.1356017291545868, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "distill_loss": 0.17617285251617432, + "epoch": 3.7758505670446967, + "step": 11320 + }, + { + "epoch": 3.7758505670446967, + "ref_ce_loss": 0.09102947264909744, + "step": 11320 + }, + { + "epoch": 3.779186124082722, + "loss": 0.5344, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "grad_norm": 2.6559865474700928, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "learning_rate": 0.00017063612141958996, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "loss": 0.5010161399841309, + "step": 11330 + }, + { + "ce_loss": 0.17269544303417206, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "distill_loss": 0.1870214194059372, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "ref_ce_loss": 0.09947448968887329, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "loss": 0.9882927536964417, + "step": 11330 + }, + { + "ce_loss": 0.2070547342300415, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "distill_loss": 0.2346445918083191, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "ref_ce_loss": 0.1403355449438095, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "loss": 0.37922123074531555, + "step": 11330 + }, + { + "ce_loss": 0.10499770939350128, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "distill_loss": 0.17015215754508972, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "ref_ce_loss": 0.10365436226129532, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "loss": 0.5613137483596802, + "step": 11330 + }, + { + "ce_loss": 0.08903953433036804, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "distill_loss": 0.16297371685504913, + "epoch": 3.779186124082722, + "step": 11330 + }, + { + "epoch": 3.779186124082722, + "ref_ce_loss": 0.07672325521707535, + "step": 11330 + }, + { + "epoch": 3.7825216811207474, + "loss": 0.4881, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "grad_norm": 3.2749686241149902, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "learning_rate": 0.00017043546730102823, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "loss": 0.39569780230522156, + "step": 11340 + }, + { + "ce_loss": 0.09518209844827652, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "distill_loss": 0.1157209724187851, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "ref_ce_loss": 0.07844128459692001, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "loss": 1.2595678567886353, + "step": 11340 + }, + { + "ce_loss": 0.14451389014720917, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "distill_loss": 0.1210441067814827, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "ref_ce_loss": 0.13017107546329498, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "loss": 0.9055213928222656, + "step": 11340 + }, + { + "ce_loss": 0.2113693356513977, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "distill_loss": 0.1787041276693344, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "ref_ce_loss": 0.07526680827140808, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "loss": 0.3183319568634033, + "step": 11340 + }, + { + "ce_loss": 0.08012717217206955, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "distill_loss": 0.1112750992178917, + "epoch": 3.7825216811207474, + "step": 11340 + }, + { + "epoch": 3.7825216811207474, + "ref_ce_loss": 0.09437817335128784, + "step": 11340 + }, + { + "epoch": 3.7858572381587727, + "loss": 0.5804, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "grad_norm": 2.631268262863159, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "learning_rate": 0.00017023477591623524, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "loss": 0.5030439496040344, + "step": 11350 + }, + { + "ce_loss": 0.09157869219779968, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "distill_loss": 0.11663976311683655, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "ref_ce_loss": 0.11903272569179535, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "loss": 0.467189759016037, + "step": 11350 + }, + { + "ce_loss": 0.12028718739748001, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "distill_loss": 0.17001944780349731, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "ref_ce_loss": 0.12809155881404877, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "loss": 0.4767748713493347, + "step": 11350 + }, + { + "ce_loss": 0.1660422831773758, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "distill_loss": 0.11231112480163574, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "ref_ce_loss": 0.1354854255914688, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "loss": 0.30572032928466797, + "step": 11350 + }, + { + "ce_loss": 0.05527346208691597, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "distill_loss": 0.11191977560520172, + "epoch": 3.7858572381587727, + "step": 11350 + }, + { + "epoch": 3.7858572381587727, + "ref_ce_loss": 0.0689462423324585, + "step": 11350 + }, + { + "epoch": 3.789192795196798, + "loss": 0.4704, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "grad_norm": 2.8327856063842773, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "learning_rate": 0.000170034047631193, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "loss": 0.3354668617248535, + "step": 11360 + }, + { + "ce_loss": 0.08834193646907806, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "distill_loss": 0.13074669241905212, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "ref_ce_loss": 0.0854744091629982, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "loss": 0.6441950798034668, + "step": 11360 + }, + { + "ce_loss": 0.1427772492170334, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "distill_loss": 0.13033153116703033, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "ref_ce_loss": 0.0994003564119339, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "loss": 0.3823297619819641, + "step": 11360 + }, + { + "ce_loss": 0.15958751738071442, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "distill_loss": 0.11525478959083557, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "ref_ce_loss": 0.08661943674087524, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "loss": 0.41648799180984497, + "step": 11360 + }, + { + "ce_loss": 0.0783916637301445, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "distill_loss": 0.10013296455144882, + "epoch": 3.789192795196798, + "step": 11360 + }, + { + "epoch": 3.789192795196798, + "ref_ce_loss": 0.15111957490444183, + "step": 11360 + }, + { + "epoch": 3.7925283522348234, + "loss": 0.5495, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "grad_norm": 5.2707719802856445, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "learning_rate": 0.0001698332828119506, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "loss": 0.8329063057899475, + "step": 11370 + }, + { + "ce_loss": 0.0788368433713913, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "distill_loss": 0.1925995647907257, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "ref_ce_loss": 0.15627038478851318, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "loss": 0.4649633467197418, + "step": 11370 + }, + { + "ce_loss": 0.1610284000635147, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "distill_loss": 0.12861159443855286, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "ref_ce_loss": 0.07107669115066528, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "loss": 0.44461050629615784, + "step": 11370 + }, + { + "ce_loss": 0.13306909799575806, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "distill_loss": 0.1817498505115509, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "ref_ce_loss": 0.11042030155658722, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "loss": 0.605265736579895, + "step": 11370 + }, + { + "ce_loss": 0.22842943668365479, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "distill_loss": 0.18195891380310059, + "epoch": 3.7925283522348234, + "step": 11370 + }, + { + "epoch": 3.7925283522348234, + "ref_ce_loss": 0.16252247989177704, + "step": 11370 + }, + { + "epoch": 3.795863909272849, + "loss": 0.4597, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "grad_norm": 2.723623037338257, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "learning_rate": 0.00016963248182462397, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "loss": 0.5169832110404968, + "step": 11380 + }, + { + "ce_loss": 0.11923842132091522, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "distill_loss": 0.1981431394815445, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "ref_ce_loss": 0.08852661401033401, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "loss": 0.38678765296936035, + "step": 11380 + }, + { + "ce_loss": 0.12697753310203552, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "distill_loss": 0.14126920700073242, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "ref_ce_loss": 0.05947829782962799, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "loss": 0.5108277201652527, + "step": 11380 + }, + { + "ce_loss": 0.0795733779668808, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "distill_loss": 0.12557706236839294, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "ref_ce_loss": 0.09753233939409256, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "loss": 0.4048956632614136, + "step": 11380 + }, + { + "ce_loss": 0.13374558091163635, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "distill_loss": 0.15967635810375214, + "epoch": 3.795863909272849, + "step": 11380 + }, + { + "epoch": 3.795863909272849, + "ref_ce_loss": 0.08542696386575699, + "step": 11380 + }, + { + "epoch": 3.799199466310874, + "loss": 0.5665, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "grad_norm": 3.3278257846832275, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "learning_rate": 0.00016943164503539491, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "loss": 0.45715150237083435, + "step": 11390 + }, + { + "ce_loss": 0.12858489155769348, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "distill_loss": 0.15878446400165558, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "ref_ce_loss": 0.13602490723133087, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "loss": 0.5386162996292114, + "step": 11390 + }, + { + "ce_loss": 0.15765678882598877, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "distill_loss": 0.15718784928321838, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "ref_ce_loss": 0.15782934427261353, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "loss": 0.3341408371925354, + "step": 11390 + }, + { + "ce_loss": 0.0596722736954689, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "distill_loss": 0.13757136464118958, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "ref_ce_loss": 0.07602232694625854, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "loss": 0.6375399231910706, + "step": 11390 + }, + { + "ce_loss": 0.21162016689777374, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "distill_loss": 0.20884963870048523, + "epoch": 3.799199466310874, + "step": 11390 + }, + { + "epoch": 3.799199466310874, + "ref_ce_loss": 0.11842446774244308, + "step": 11390 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.4789, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "grad_norm": 2.8649020195007324, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "learning_rate": 0.00016923077281051041, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.4338218569755554, + "step": 11400 + }, + { + "ce_loss": 0.14233727753162384, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "distill_loss": 0.1555687040090561, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "ref_ce_loss": 0.0952368825674057, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.45588257908821106, + "step": 11400 + }, + { + "ce_loss": 0.0950573980808258, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "distill_loss": 0.1703718602657318, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "ref_ce_loss": 0.10461972653865814, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.349023699760437, + "step": 11400 + }, + { + "ce_loss": 0.12215173244476318, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "distill_loss": 0.13151374459266663, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "ref_ce_loss": 0.08300989866256714, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "loss": 0.5359344482421875, + "step": 11400 + }, + { + "ce_loss": 0.07137981802225113, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "distill_loss": 0.15614856779575348, + "epoch": 3.8025350233488995, + "step": 11400 + }, + { + "epoch": 3.8025350233488995, + "ref_ce_loss": 0.10713817179203033, + "step": 11400 + }, + { + "epoch": 3.805870580386925, + "loss": 0.5378, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "grad_norm": 1.8696403503417969, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "learning_rate": 0.00016902986551628227, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "loss": 0.7627152800559998, + "step": 11410 + }, + { + "ce_loss": 0.16251112520694733, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "distill_loss": 0.18291950225830078, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "ref_ce_loss": 0.06121499091386795, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "loss": 0.41648682951927185, + "step": 11410 + }, + { + "ce_loss": 0.1059994250535965, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "distill_loss": 0.14188294112682343, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "ref_ce_loss": 0.10719886422157288, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "loss": 0.6039667725563049, + "step": 11410 + }, + { + "ce_loss": 0.11324057728052139, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "distill_loss": 0.15148040652275085, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "ref_ce_loss": 0.1474461853504181, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "loss": 0.5163719654083252, + "step": 11410 + }, + { + "ce_loss": 0.15379080176353455, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "distill_loss": 0.17272308468818665, + "epoch": 3.805870580386925, + "step": 11410 + }, + { + "epoch": 3.805870580386925, + "ref_ce_loss": 0.1529800295829773, + "step": 11410 + }, + { + "epoch": 3.80920613742495, + "loss": 0.5488, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "grad_norm": 4.255395889282227, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "learning_rate": 0.00016882892351908606, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "loss": 0.7047617435455322, + "step": 11420 + }, + { + "ce_loss": 0.08719105273485184, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "distill_loss": 0.14586789906024933, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "ref_ce_loss": 0.06734959781169891, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "loss": 0.3226097524166107, + "step": 11420 + }, + { + "ce_loss": 0.10788920521736145, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "distill_loss": 0.10606271028518677, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "ref_ce_loss": 0.10857447236776352, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "loss": 0.506752073764801, + "step": 11420 + }, + { + "ce_loss": 0.20389452576637268, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "distill_loss": 0.1739177256822586, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "ref_ce_loss": 0.08177173137664795, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "loss": 0.4029782712459564, + "step": 11420 + }, + { + "ce_loss": 0.10493995994329453, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "distill_loss": 0.1328524351119995, + "epoch": 3.80920613742495, + "step": 11420 + }, + { + "epoch": 3.80920613742495, + "ref_ce_loss": 0.11656699329614639, + "step": 11420 + }, + { + "epoch": 3.8125416944629755, + "loss": 0.5567, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "grad_norm": 1.993993878364563, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "learning_rate": 0.0001686279471853608, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "loss": 0.955905556678772, + "step": 11430 + }, + { + "ce_loss": 0.1232018992304802, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "distill_loss": 0.20480038225650787, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "ref_ce_loss": 0.10530410706996918, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "loss": 0.543412446975708, + "step": 11430 + }, + { + "ce_loss": 0.18799374997615814, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "distill_loss": 0.1597263514995575, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "ref_ce_loss": 0.11109674721956253, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "loss": 0.33423781394958496, + "step": 11430 + }, + { + "ce_loss": 0.10933635383844376, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "distill_loss": 0.13336357474327087, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "ref_ce_loss": 0.0713241845369339, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "loss": 0.45266109704971313, + "step": 11430 + }, + { + "ce_loss": 0.14071442186832428, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "distill_loss": 0.1235266625881195, + "epoch": 3.8125416944629755, + "step": 11430 + }, + { + "epoch": 3.8125416944629755, + "ref_ce_loss": 0.10337050259113312, + "step": 11430 + }, + { + "epoch": 3.815877251501001, + "loss": 0.4643, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "grad_norm": 2.24055552482605, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "learning_rate": 0.00016842693688160794, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "loss": 0.5264979004859924, + "step": 11440 + }, + { + "ce_loss": 0.17195414006710052, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "distill_loss": 0.1488429754972458, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "ref_ce_loss": 0.14474719762802124, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "loss": 0.4335517883300781, + "step": 11440 + }, + { + "ce_loss": 0.13938097655773163, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "distill_loss": 0.14782142639160156, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "ref_ce_loss": 0.06512699276208878, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "loss": 0.4729892611503601, + "step": 11440 + }, + { + "ce_loss": 0.11497749388217926, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "distill_loss": 0.1210591122508049, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "ref_ce_loss": 0.1321076601743698, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "loss": 0.8550109267234802, + "step": 11440 + }, + { + "ce_loss": 0.14266787469387054, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "distill_loss": 0.16640591621398926, + "epoch": 3.815877251501001, + "step": 11440 + }, + { + "epoch": 3.815877251501001, + "ref_ce_loss": 0.1064901351928711, + "step": 11440 + }, + { + "epoch": 3.8192128085390262, + "loss": 0.4977, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "grad_norm": 3.0378942489624023, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "learning_rate": 0.00016822589297439108, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "loss": 0.8309723138809204, + "step": 11450 + }, + { + "ce_loss": 0.18916484713554382, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "distill_loss": 0.17502309381961823, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "ref_ce_loss": 0.09515126049518585, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "loss": 0.7633737325668335, + "step": 11450 + }, + { + "ce_loss": 0.3523317873477936, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "distill_loss": 0.235255628824234, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "ref_ce_loss": 0.13086393475532532, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "loss": 0.5058243274688721, + "step": 11450 + }, + { + "ce_loss": 0.19745786488056183, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "distill_loss": 0.13970790803432465, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "ref_ce_loss": 0.12864728271961212, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "loss": 0.9811885356903076, + "step": 11450 + }, + { + "ce_loss": 0.13506951928138733, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "distill_loss": 0.12431784719228745, + "epoch": 3.8192128085390262, + "step": 11450 + }, + { + "epoch": 3.8192128085390262, + "ref_ce_loss": 0.09122858941555023, + "step": 11450 + }, + { + "epoch": 3.8225483655770516, + "loss": 0.5499, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "grad_norm": 3.075512170791626, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "learning_rate": 0.00016802481583033495, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "loss": 0.40927478671073914, + "step": 11460 + }, + { + "ce_loss": 0.12293460220098495, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "distill_loss": 0.16415520012378693, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "ref_ce_loss": 0.08292596787214279, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "loss": 0.30401870608329773, + "step": 11460 + }, + { + "ce_loss": 0.0601293109357357, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "distill_loss": 0.11315567791461945, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "ref_ce_loss": 0.13054944574832916, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "loss": 1.0063207149505615, + "step": 11460 + }, + { + "ce_loss": 0.1392781138420105, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "distill_loss": 0.12168440222740173, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "ref_ce_loss": 0.09981101006269455, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "loss": 0.37048590183258057, + "step": 11460 + }, + { + "ce_loss": 0.09129716455936432, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "distill_loss": 0.1344550997018814, + "epoch": 3.8225483655770516, + "step": 11460 + }, + { + "epoch": 3.8225483655770516, + "ref_ce_loss": 0.07073692977428436, + "step": 11460 + }, + { + "epoch": 3.825883922615077, + "loss": 0.5073, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "grad_norm": 3.2191481590270996, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "learning_rate": 0.000167823705816125, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "loss": 0.48221296072006226, + "step": 11470 + }, + { + "ce_loss": 0.1248675063252449, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "distill_loss": 0.18250107765197754, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "ref_ce_loss": 0.13513994216918945, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "loss": 0.493794322013855, + "step": 11470 + }, + { + "ce_loss": 0.12715068459510803, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "distill_loss": 0.15496912598609924, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "ref_ce_loss": 0.1318623125553131, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "loss": 0.3657727837562561, + "step": 11470 + }, + { + "ce_loss": 0.13231192529201508, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "distill_loss": 0.16635209321975708, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "ref_ce_loss": 0.046196348965168, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "loss": 0.6422207951545715, + "step": 11470 + }, + { + "ce_loss": 0.14488328993320465, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "distill_loss": 0.14699134230613708, + "epoch": 3.825883922615077, + "step": 11470 + }, + { + "epoch": 3.825883922615077, + "ref_ce_loss": 0.1012740209698677, + "step": 11470 + }, + { + "epoch": 3.8292194796531023, + "loss": 0.4673, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "grad_norm": 2.7229583263397217, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "learning_rate": 0.0001676225632985065, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "loss": 0.9861547946929932, + "step": 11480 + }, + { + "ce_loss": 0.07622343301773071, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "distill_loss": 0.12430855631828308, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "ref_ce_loss": 0.1209321841597557, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "loss": 0.4856456220149994, + "step": 11480 + }, + { + "ce_loss": 0.1425744742155075, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "distill_loss": 0.2019084393978119, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "ref_ce_loss": 0.09371843934059143, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "loss": 0.6687341928482056, + "step": 11480 + }, + { + "ce_loss": 0.2130454182624817, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "distill_loss": 0.21954701840877533, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "ref_ce_loss": 0.10809019207954407, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "loss": 0.3015283942222595, + "step": 11480 + }, + { + "ce_loss": 0.08696311712265015, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "distill_loss": 0.11111010611057281, + "epoch": 3.8292194796531023, + "step": 11480 + }, + { + "epoch": 3.8292194796531023, + "ref_ce_loss": 0.0717904344201088, + "step": 11480 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.5021, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "grad_norm": 2.285511016845703, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "learning_rate": 0.00016742138864428403, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.36162352561950684, + "step": 11490 + }, + { + "ce_loss": 0.09861333668231964, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "distill_loss": 0.12867295742034912, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "ref_ce_loss": 0.1342122107744217, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.5594727993011475, + "step": 11490 + }, + { + "ce_loss": 0.1684267222881317, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "distill_loss": 0.16716884076595306, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "ref_ce_loss": 0.09705770015716553, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.48375794291496277, + "step": 11490 + }, + { + "ce_loss": 0.19998034834861755, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "distill_loss": 0.13959825038909912, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "ref_ce_loss": 0.14406739175319672, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "loss": 0.46701711416244507, + "step": 11490 + }, + { + "ce_loss": 0.13540586829185486, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "distill_loss": 0.15161505341529846, + "epoch": 3.8325550366911276, + "step": 11490 + }, + { + "epoch": 3.8325550366911276, + "ref_ce_loss": 0.12113173305988312, + "step": 11490 + }, + { + "epoch": 3.835890593729153, + "loss": 0.4589, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "grad_norm": 2.968534469604492, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "learning_rate": 0.00016722018222032085, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "loss": 0.48139268159866333, + "step": 11500 + }, + { + "ce_loss": 0.1482585072517395, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "distill_loss": 0.16626609861850739, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "ref_ce_loss": 0.11868167668581009, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "loss": 0.46502596139907837, + "step": 11500 + }, + { + "ce_loss": 0.16445650160312653, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "distill_loss": 0.16992321610450745, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "ref_ce_loss": 0.10180897265672684, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "loss": 0.6293812394142151, + "step": 11500 + }, + { + "ce_loss": 0.06558717787265778, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "distill_loss": 0.13015815615653992, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "ref_ce_loss": 0.081215038895607, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "loss": 0.9193124175071716, + "step": 11500 + }, + { + "ce_loss": 0.17590521275997162, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "distill_loss": 0.21001935005187988, + "epoch": 3.835890593729153, + "step": 11500 + }, + { + "epoch": 3.835890593729153, + "ref_ce_loss": 0.1322702318429947, + "step": 11500 + }, + { + "epoch": 3.8392261507671783, + "loss": 0.5016, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "grad_norm": 3.171128034591675, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "learning_rate": 0.00016701894439353818, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "loss": 0.40584662556648254, + "step": 11510 + }, + { + "ce_loss": 0.11897286772727966, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "distill_loss": 0.15135160088539124, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "ref_ce_loss": 0.1354077309370041, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "loss": 0.44731903076171875, + "step": 11510 + }, + { + "ce_loss": 0.1088339239358902, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "distill_loss": 0.1577133685350418, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "ref_ce_loss": 0.13546203076839447, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "loss": 0.9254103899002075, + "step": 11510 + }, + { + "ce_loss": 0.18921251595020294, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "distill_loss": 0.1279955953359604, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "ref_ce_loss": 0.11867087334394455, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "loss": 0.41881296038627625, + "step": 11510 + }, + { + "ce_loss": 0.1359606236219406, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "distill_loss": 0.16384638845920563, + "epoch": 3.8392261507671783, + "step": 11510 + }, + { + "epoch": 3.8392261507671783, + "ref_ce_loss": 0.08438153564929962, + "step": 11510 + }, + { + "epoch": 3.8425617078052037, + "loss": 0.5232, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "grad_norm": 3.616981267929077, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "learning_rate": 0.0001668176755309143, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "loss": 0.36008861660957336, + "step": 11520 + }, + { + "ce_loss": 0.09858591854572296, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "distill_loss": 0.13108551502227783, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "ref_ce_loss": 0.10107474774122238, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "loss": 0.4817239046096802, + "step": 11520 + }, + { + "ce_loss": 0.03896396607160568, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "distill_loss": 0.10215901583433151, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "ref_ce_loss": 0.07847031205892563, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "loss": 0.6684030294418335, + "step": 11520 + }, + { + "ce_loss": 0.1818249374628067, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "distill_loss": 0.14205506443977356, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "ref_ce_loss": 0.11002738028764725, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "loss": 0.3725161552429199, + "step": 11520 + }, + { + "ce_loss": 0.12967868149280548, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "distill_loss": 0.14431628584861755, + "epoch": 3.8425617078052037, + "step": 11520 + }, + { + "epoch": 3.8425617078052037, + "ref_ce_loss": 0.0984206423163414, + "step": 11520 + }, + { + "epoch": 3.845897264843229, + "loss": 0.4685, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "grad_norm": 2.334750175476074, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "learning_rate": 0.0001666163759994843, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "loss": 0.6536337733268738, + "step": 11530 + }, + { + "ce_loss": 0.2566821873188019, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "distill_loss": 0.22711533308029175, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "ref_ce_loss": 0.13162018358707428, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "loss": 0.5456714630126953, + "step": 11530 + }, + { + "ce_loss": 0.21901577711105347, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "distill_loss": 0.15249355137348175, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "ref_ce_loss": 0.13759253919124603, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "loss": 0.36008813977241516, + "step": 11530 + }, + { + "ce_loss": 0.10647378861904144, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "distill_loss": 0.09742897003889084, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "ref_ce_loss": 0.10891912877559662, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "loss": 0.4108813405036926, + "step": 11530 + }, + { + "ce_loss": 0.15171436965465546, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "distill_loss": 0.12007388472557068, + "epoch": 3.845897264843229, + "step": 11530 + }, + { + "epoch": 3.845897264843229, + "ref_ce_loss": 0.10404207557439804, + "step": 11530 + }, + { + "epoch": 3.8492328218812544, + "loss": 0.4934, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "grad_norm": 8.927061080932617, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "learning_rate": 0.0001664150461663391, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "loss": 0.371764600276947, + "step": 11540 + }, + { + "ce_loss": 0.10404995083808899, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "distill_loss": 0.1396816223859787, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "ref_ce_loss": 0.09026072919368744, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "loss": 0.470393568277359, + "step": 11540 + }, + { + "ce_loss": 0.11666182428598404, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "distill_loss": 0.15077465772628784, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "ref_ce_loss": 0.10939660668373108, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "loss": 0.6475262641906738, + "step": 11540 + }, + { + "ce_loss": 0.2445875108242035, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "distill_loss": 0.16829434037208557, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "ref_ce_loss": 0.11170588433742523, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "loss": 0.2612134516239166, + "step": 11540 + }, + { + "ce_loss": 0.0813259482383728, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "distill_loss": 0.11056999862194061, + "epoch": 3.8492328218812544, + "step": 11540 + }, + { + "epoch": 3.8492328218812544, + "ref_ce_loss": 0.06922511756420135, + "step": 11540 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.4508, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "grad_norm": 5.26814079284668, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "learning_rate": 0.00016621368639862488, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.5349399447441101, + "step": 11550 + }, + { + "ce_loss": 0.20689332485198975, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "distill_loss": 0.14780738949775696, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "ref_ce_loss": 0.12087158858776093, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.3580425977706909, + "step": 11550 + }, + { + "ce_loss": 0.07902470231056213, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "distill_loss": 0.09948811680078506, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "ref_ce_loss": 0.06858757883310318, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.31841012835502625, + "step": 11550 + }, + { + "ce_loss": 0.09229130297899246, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "distill_loss": 0.15116482973098755, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "ref_ce_loss": 0.07489918172359467, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "loss": 0.351014643907547, + "step": 11550 + }, + { + "ce_loss": 0.12908421456813812, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "distill_loss": 0.1351543664932251, + "epoch": 3.8525683789192797, + "step": 11550 + }, + { + "epoch": 3.8525683789192797, + "ref_ce_loss": 0.0867190733551979, + "step": 11550 + }, + { + "epoch": 3.855903935957305, + "loss": 0.4975, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "grad_norm": 2.962989330291748, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "learning_rate": 0.0001660122970635425, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "loss": 0.5496320128440857, + "step": 11560 + }, + { + "ce_loss": 0.23305082321166992, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "distill_loss": 0.182064026594162, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "ref_ce_loss": 0.10594523698091507, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "loss": 0.5559959411621094, + "step": 11560 + }, + { + "ce_loss": 0.14517226815223694, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "distill_loss": 0.13715030252933502, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "ref_ce_loss": 0.10893628746271133, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "loss": 0.5879688262939453, + "step": 11560 + }, + { + "ce_loss": 0.11704345792531967, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "distill_loss": 0.12446143478155136, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "ref_ce_loss": 0.1276710480451584, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "loss": 0.747494101524353, + "step": 11560 + }, + { + "ce_loss": 0.1367560178041458, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "distill_loss": 0.1948416382074356, + "epoch": 3.855903935957305, + "step": 11560 + }, + { + "epoch": 3.855903935957305, + "ref_ce_loss": 0.12615439295768738, + "step": 11560 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.4883, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "grad_norm": 2.0698843002319336, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "learning_rate": 0.00016581087852834657, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.44442111253738403, + "step": 11570 + }, + { + "ce_loss": 0.08238573372364044, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "distill_loss": 0.14392045140266418, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "ref_ce_loss": 0.08191262930631638, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.490500271320343, + "step": 11570 + }, + { + "ce_loss": 0.15933695435523987, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "distill_loss": 0.13870687782764435, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "ref_ce_loss": 0.0866045281291008, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.5080830454826355, + "step": 11570 + }, + { + "ce_loss": 0.20873354375362396, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "distill_loss": 0.1734309196472168, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "ref_ce_loss": 0.09393316507339478, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "loss": 0.5791904330253601, + "step": 11570 + }, + { + "ce_loss": 0.18548917770385742, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "distill_loss": 0.18491113185882568, + "epoch": 3.8592394929953304, + "step": 11570 + }, + { + "epoch": 3.8592394929953304, + "ref_ce_loss": 0.16175051033496857, + "step": 11570 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.5087, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "grad_norm": 3.098926067352295, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "learning_rate": 0.00016560943116034513, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.49878960847854614, + "step": 11580 + }, + { + "ce_loss": 0.1209196150302887, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "distill_loss": 0.15495194494724274, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "ref_ce_loss": 0.08308865875005722, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.29101717472076416, + "step": 11580 + }, + { + "ce_loss": 0.08006220310926437, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "distill_loss": 0.11955109238624573, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "ref_ce_loss": 0.09133538603782654, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.4208149015903473, + "step": 11580 + }, + { + "ce_loss": 0.04855787381529808, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "distill_loss": 0.1554453819990158, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "ref_ce_loss": 0.07605446130037308, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "loss": 0.42378556728363037, + "step": 11580 + }, + { + "ce_loss": 0.13488538563251495, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "distill_loss": 0.15314099192619324, + "epoch": 3.8625750500333558, + "step": 11580 + }, + { + "epoch": 3.8625750500333558, + "ref_ce_loss": 0.09601394087076187, + "step": 11580 + }, + { + "epoch": 3.865910607071381, + "loss": 0.4683, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "grad_norm": 3.4676589965820312, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "learning_rate": 0.00016540795532689863, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "loss": 0.2266787439584732, + "step": 11590 + }, + { + "ce_loss": 0.06223537027835846, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "distill_loss": 0.08924637734889984, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "ref_ce_loss": 0.05673402175307274, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "loss": 0.5110219120979309, + "step": 11590 + }, + { + "ce_loss": 0.1444280594587326, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "distill_loss": 0.1732235997915268, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "ref_ce_loss": 0.11704862117767334, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "loss": 0.5369408130645752, + "step": 11590 + }, + { + "ce_loss": 0.15495778620243073, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "distill_loss": 0.13318301737308502, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "ref_ce_loss": 0.10845693200826645, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "loss": 0.7540019750595093, + "step": 11590 + }, + { + "ce_loss": 0.2528756558895111, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "distill_loss": 0.19086939096450806, + "epoch": 3.865910607071381, + "step": 11590 + }, + { + "epoch": 3.865910607071381, + "ref_ce_loss": 0.16340744495391846, + "step": 11590 + }, + { + "epoch": 3.8692461641094065, + "loss": 0.5208, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "grad_norm": 2.3811683654785156, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "learning_rate": 0.00016520645139541951, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "loss": 0.5290732979774475, + "step": 11600 + }, + { + "ce_loss": 0.13007256388664246, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "distill_loss": 0.19593222439289093, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "ref_ce_loss": 0.09087681025266647, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "loss": 0.43068817257881165, + "step": 11600 + }, + { + "ce_loss": 0.14082638919353485, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "distill_loss": 0.16190363466739655, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "ref_ce_loss": 0.12784138321876526, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "loss": 0.4996127188205719, + "step": 11600 + }, + { + "ce_loss": 0.15893928706645966, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "distill_loss": 0.23627258837223053, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "ref_ce_loss": 0.1043238639831543, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "loss": 0.44533899426460266, + "step": 11600 + }, + { + "ce_loss": 0.07461967319250107, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "distill_loss": 0.14704746007919312, + "epoch": 3.8692461641094065, + "step": 11600 + }, + { + "epoch": 3.8692461641094065, + "ref_ce_loss": 0.10774028301239014, + "step": 11600 + }, + { + "epoch": 3.872581721147432, + "loss": 0.5007, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "grad_norm": 3.090487003326416, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "learning_rate": 0.00016500491973337158, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "loss": 0.2862015664577484, + "step": 11610 + }, + { + "ce_loss": 0.07175882905721664, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "distill_loss": 0.1443636417388916, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "ref_ce_loss": 0.047852739691734314, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "loss": 0.9817602634429932, + "step": 11610 + }, + { + "ce_loss": 0.11350759863853455, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "distill_loss": 0.14939598739147186, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "ref_ce_loss": 0.10215581208467484, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "loss": 0.5146198272705078, + "step": 11610 + }, + { + "ce_loss": 0.17866657674312592, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "distill_loss": 0.1787855625152588, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "ref_ce_loss": 0.11341854184865952, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "loss": 0.3538992702960968, + "step": 11610 + }, + { + "ce_loss": 0.12631307542324066, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "distill_loss": 0.15863800048828125, + "epoch": 3.872581721147432, + "step": 11610 + }, + { + "epoch": 3.872581721147432, + "ref_ce_loss": 0.06889407336711884, + "step": 11610 + }, + { + "epoch": 3.875917278185457, + "loss": 0.5357, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "grad_norm": 2.8688204288482666, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "learning_rate": 0.00016480336070826904, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "loss": 0.3075914978981018, + "step": 11620 + }, + { + "ce_loss": 0.0304547268897295, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "distill_loss": 0.1418517827987671, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "ref_ce_loss": 0.06853881478309631, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "loss": 0.264699250459671, + "step": 11620 + }, + { + "ce_loss": 0.03863706439733505, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "distill_loss": 0.1346575915813446, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "ref_ce_loss": 0.06331543624401093, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "loss": 0.35955944657325745, + "step": 11620 + }, + { + "ce_loss": 0.0983886793255806, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "distill_loss": 0.13251645863056183, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "ref_ce_loss": 0.053774092346429825, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "loss": 0.3828924000263214, + "step": 11620 + }, + { + "ce_loss": 0.0748138353228569, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "distill_loss": 0.155991330742836, + "epoch": 3.875917278185457, + "step": 11620 + }, + { + "epoch": 3.875917278185457, + "ref_ce_loss": 0.11473026126623154, + "step": 11620 + }, + { + "epoch": 3.8792528352234825, + "loss": 0.5052, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "grad_norm": 2.6132733821868896, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "learning_rate": 0.00016460177468767588, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "loss": 0.7341196537017822, + "step": 11630 + }, + { + "ce_loss": 0.1839066743850708, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "distill_loss": 0.20775669813156128, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "ref_ce_loss": 0.15465961396694183, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "loss": 0.6776058673858643, + "step": 11630 + }, + { + "ce_loss": 0.13776738941669464, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "distill_loss": 0.1728818416595459, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "ref_ce_loss": 0.09529329836368561, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "loss": 0.44648653268814087, + "step": 11630 + }, + { + "ce_loss": 0.1300860196352005, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "distill_loss": 0.1513717770576477, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "ref_ce_loss": 0.12261359393596649, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "loss": 0.6910187005996704, + "step": 11630 + }, + { + "ce_loss": 0.2088298350572586, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "distill_loss": 0.2162533700466156, + "epoch": 3.8792528352234825, + "step": 11630 + }, + { + "epoch": 3.8792528352234825, + "ref_ce_loss": 0.13651823997497559, + "step": 11630 + }, + { + "epoch": 3.882588392261508, + "loss": 0.5367, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "grad_norm": 3.5043728351593018, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "learning_rate": 0.00016440016203920574, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "loss": 0.4661126136779785, + "step": 11640 + }, + { + "ce_loss": 0.09211700409650803, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "distill_loss": 0.19817857444286346, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "ref_ce_loss": 0.08246038854122162, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "loss": 0.47072139382362366, + "step": 11640 + }, + { + "ce_loss": 0.1250084787607193, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "distill_loss": 0.25739413499832153, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "ref_ce_loss": 0.08818263560533524, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "loss": 0.4213848412036896, + "step": 11640 + }, + { + "ce_loss": 0.12245972454547882, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "distill_loss": 0.18493112921714783, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "ref_ce_loss": 0.11392603069543839, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "loss": 0.8170498013496399, + "step": 11640 + }, + { + "ce_loss": 0.12693078815937042, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "distill_loss": 0.21718522906303406, + "epoch": 3.882588392261508, + "step": 11640 + }, + { + "epoch": 3.882588392261508, + "ref_ce_loss": 0.09534420818090439, + "step": 11640 + }, + { + "epoch": 3.885923949299533, + "loss": 0.5095, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "grad_norm": 3.2084357738494873, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "learning_rate": 0.00016419852313052043, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "loss": 0.4158484637737274, + "step": 11650 + }, + { + "ce_loss": 0.12239982187747955, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "distill_loss": 0.13823160529136658, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "ref_ce_loss": 0.11193165183067322, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "loss": 0.371078759431839, + "step": 11650 + }, + { + "ce_loss": 0.12052211910486221, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "distill_loss": 0.14332422614097595, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "ref_ce_loss": 0.0856185331940651, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "loss": 0.4371638298034668, + "step": 11650 + }, + { + "ce_loss": 0.16073602437973022, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "distill_loss": 0.1462903916835785, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "ref_ce_loss": 0.09076157212257385, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "loss": 0.47583967447280884, + "step": 11650 + }, + { + "ce_loss": 0.13007958233356476, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "distill_loss": 0.19589829444885254, + "epoch": 3.885923949299533, + "step": 11650 + }, + { + "epoch": 3.885923949299533, + "ref_ce_loss": 0.10973778367042542, + "step": 11650 + }, + { + "epoch": 3.8892595063375586, + "loss": 0.4876, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "grad_norm": 2.6360435485839844, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "learning_rate": 0.00016399685832932975, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "loss": 0.504503071308136, + "step": 11660 + }, + { + "ce_loss": 0.11461180448532104, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "distill_loss": 0.18167349696159363, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "ref_ce_loss": 0.0868772566318512, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "loss": 0.705942690372467, + "step": 11660 + }, + { + "ce_loss": 0.20406785607337952, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "distill_loss": 0.16648991405963898, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "ref_ce_loss": 0.12941238284111023, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "loss": 0.31773361563682556, + "step": 11660 + }, + { + "ce_loss": 0.10134835541248322, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "distill_loss": 0.13180075585842133, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "ref_ce_loss": 0.0696868896484375, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "loss": 0.33874279260635376, + "step": 11660 + }, + { + "ce_loss": 0.07513560354709625, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "distill_loss": 0.10537146776914597, + "epoch": 3.8892595063375586, + "step": 11660 + }, + { + "epoch": 3.8892595063375586, + "ref_ce_loss": 0.09582258760929108, + "step": 11660 + }, + { + "epoch": 3.892595063375584, + "loss": 0.5061, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "grad_norm": 2.3636677265167236, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "learning_rate": 0.0001637951680033908, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "loss": 0.35305699706077576, + "step": 11670 + }, + { + "ce_loss": 0.11162156611680984, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "distill_loss": 0.11027735471725464, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "ref_ce_loss": 0.0959223136305809, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "loss": 0.4801453948020935, + "step": 11670 + }, + { + "ce_loss": 0.11221076548099518, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "distill_loss": 0.16410060226917267, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "ref_ce_loss": 0.08730052411556244, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "loss": 0.4774989187717438, + "step": 11670 + }, + { + "ce_loss": 0.06793775409460068, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "distill_loss": 0.1663445234298706, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "ref_ce_loss": 0.16433344781398773, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "loss": 0.5329448580741882, + "step": 11670 + }, + { + "ce_loss": 0.14226965606212616, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "distill_loss": 0.15646116435527802, + "epoch": 3.892595063375584, + "step": 11670 + }, + { + "epoch": 3.892595063375584, + "ref_ce_loss": 0.08196160942316055, + "step": 11670 + }, + { + "epoch": 3.8959306204136093, + "loss": 0.4525, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "grad_norm": 3.344223737716675, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "learning_rate": 0.0001635934525205072, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "loss": 0.378836065530777, + "step": 11680 + }, + { + "ce_loss": 0.09642636775970459, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "distill_loss": 0.14807581901550293, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "ref_ce_loss": 0.07628420740365982, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "loss": 0.42805588245391846, + "step": 11680 + }, + { + "ce_loss": 0.06449296325445175, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "distill_loss": 0.1382300704717636, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "ref_ce_loss": 0.08380676805973053, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "loss": 1.1077532768249512, + "step": 11680 + }, + { + "ce_loss": 0.2406988888978958, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "distill_loss": 0.21715620160102844, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "ref_ce_loss": 0.08214037120342255, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "loss": 0.6784728169441223, + "step": 11680 + }, + { + "ce_loss": 0.2056596726179123, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "distill_loss": 0.18610471487045288, + "epoch": 3.8959306204136093, + "step": 11680 + }, + { + "epoch": 3.8959306204136093, + "ref_ce_loss": 0.11367989331483841, + "step": 11680 + }, + { + "epoch": 3.8992661774516346, + "loss": 0.5363, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "grad_norm": 3.669268846511841, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "learning_rate": 0.00016339171224852834, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "loss": 0.5375806093215942, + "step": 11690 + }, + { + "ce_loss": 0.2153269350528717, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "distill_loss": 0.19027647376060486, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "ref_ce_loss": 0.13186945021152496, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "loss": 0.43219420313835144, + "step": 11690 + }, + { + "ce_loss": 0.15259014070034027, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "distill_loss": 0.16870765388011932, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "ref_ce_loss": 0.11080259084701538, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "loss": 0.4578809142112732, + "step": 11690 + }, + { + "ce_loss": 0.11323428899049759, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "distill_loss": 0.1490565836429596, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "ref_ce_loss": 0.0648016631603241, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "loss": 0.6001131534576416, + "step": 11690 + }, + { + "ce_loss": 0.1150934249162674, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "distill_loss": 0.12322366237640381, + "epoch": 3.8992661774516346, + "step": 11690 + }, + { + "epoch": 3.8992661774516346, + "ref_ce_loss": 0.07318437844514847, + "step": 11690 + }, + { + "epoch": 3.90260173448966, + "loss": 0.4775, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "grad_norm": 2.9176535606384277, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "learning_rate": 0.00016318994755534894, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "loss": 0.21333375573158264, + "step": 11700 + }, + { + "ce_loss": 0.04972422122955322, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "distill_loss": 0.09933877736330032, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "ref_ce_loss": 0.06423191726207733, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "loss": 0.4005766212940216, + "step": 11700 + }, + { + "ce_loss": 0.1483602076768875, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "distill_loss": 0.14177684485912323, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "ref_ce_loss": 0.08312007784843445, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "loss": 0.4453301429748535, + "step": 11700 + }, + { + "ce_loss": 0.13142706453800201, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "distill_loss": 0.19013100862503052, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "ref_ce_loss": 0.0937122255563736, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "loss": 0.4367537200450897, + "step": 11700 + }, + { + "ce_loss": 0.1704091727733612, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "distill_loss": 0.14347653090953827, + "epoch": 3.90260173448966, + "step": 11700 + }, + { + "epoch": 3.90260173448966, + "ref_ce_loss": 0.07881513237953186, + "step": 11700 + }, + { + "epoch": 3.9059372915276853, + "loss": 0.4818, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "grad_norm": 2.7853381633758545, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "learning_rate": 0.00016298815880890822, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "loss": 0.2837199568748474, + "step": 11710 + }, + { + "ce_loss": 0.06674331426620483, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "distill_loss": 0.14998966455459595, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "ref_ce_loss": 0.06695345044136047, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "loss": 0.42292481660842896, + "step": 11710 + }, + { + "ce_loss": 0.1149093508720398, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "distill_loss": 0.14028172194957733, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "ref_ce_loss": 0.11760975420475006, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "loss": 0.424089252948761, + "step": 11710 + }, + { + "ce_loss": 0.06741020083427429, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "distill_loss": 0.15840831398963928, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "ref_ce_loss": 0.11006693542003632, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "loss": 0.3766666650772095, + "step": 11710 + }, + { + "ce_loss": 0.04787565395236015, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "distill_loss": 0.1054430678486824, + "epoch": 3.9059372915276853, + "step": 11710 + }, + { + "epoch": 3.9059372915276853, + "ref_ce_loss": 0.07436710596084595, + "step": 11710 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.4974, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "grad_norm": 2.3894925117492676, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "learning_rate": 0.00016278634637718922, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.41334080696105957, + "step": 11720 + }, + { + "ce_loss": 0.08776438236236572, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "distill_loss": 0.14124101400375366, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "ref_ce_loss": 0.09219489991664886, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.42992448806762695, + "step": 11720 + }, + { + "ce_loss": 0.09458998590707779, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "distill_loss": 0.18413154780864716, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "ref_ce_loss": 0.15011750161647797, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.3641749918460846, + "step": 11720 + }, + { + "ce_loss": 0.12528592348098755, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "distill_loss": 0.13670697808265686, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "ref_ce_loss": 0.07176109403371811, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "loss": 0.36159613728523254, + "step": 11720 + }, + { + "ce_loss": 0.12030640989542007, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "distill_loss": 0.14727510511875153, + "epoch": 3.9092728485657107, + "step": 11720 + }, + { + "epoch": 3.9092728485657107, + "ref_ce_loss": 0.0937834084033966, + "step": 11720 + }, + { + "epoch": 3.912608405603736, + "loss": 0.4675, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "grad_norm": 3.769256830215454, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "learning_rate": 0.00016258451062821827, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "loss": 0.2411087602376938, + "step": 11730 + }, + { + "ce_loss": 0.0759025439620018, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "distill_loss": 0.09947311878204346, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "ref_ce_loss": 0.04836536571383476, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "loss": 0.8758841753005981, + "step": 11730 + }, + { + "ce_loss": 0.1558084785938263, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "distill_loss": 0.11926652491092682, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "ref_ce_loss": 0.16222594678401947, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "loss": 0.36194390058517456, + "step": 11730 + }, + { + "ce_loss": 0.11282184720039368, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "distill_loss": 0.15627947449684143, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "ref_ce_loss": 0.09275899082422256, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "loss": 0.5852751731872559, + "step": 11730 + }, + { + "ce_loss": 0.10109452903270721, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "distill_loss": 0.15885433554649353, + "epoch": 3.912608405603736, + "step": 11730 + }, + { + "epoch": 3.912608405603736, + "ref_ce_loss": 0.12536212801933289, + "step": 11730 + }, + { + "epoch": 3.9159439626417614, + "loss": 0.4803, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "grad_norm": 3.014451265335083, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "learning_rate": 0.0001623826519300641, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "loss": 0.3135759234428406, + "step": 11740 + }, + { + "ce_loss": 0.09415557235479355, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "distill_loss": 0.11723636090755463, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "ref_ce_loss": 0.06273960322141647, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "loss": 0.30374178290367126, + "step": 11740 + }, + { + "ce_loss": 0.10108528286218643, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "distill_loss": 0.11091496795415878, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "ref_ce_loss": 0.0914100706577301, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "loss": 0.6572458744049072, + "step": 11740 + }, + { + "ce_loss": 0.1793697476387024, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "distill_loss": 0.14189916849136353, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "ref_ce_loss": 0.09885284304618835, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "loss": 0.43537265062332153, + "step": 11740 + }, + { + "ce_loss": 0.1796526312828064, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "distill_loss": 0.13787336647510529, + "epoch": 3.9159439626417614, + "step": 11740 + }, + { + "epoch": 3.9159439626417614, + "ref_ce_loss": 0.08680669963359833, + "step": 11740 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.4401, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "grad_norm": 2.1687493324279785, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "learning_rate": 0.00016218077065083736, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.3428560793399811, + "step": 11750 + }, + { + "ce_loss": 0.052370764315128326, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "distill_loss": 0.0916663259267807, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "ref_ce_loss": 0.09654126316308975, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.3783106505870819, + "step": 11750 + }, + { + "ce_loss": 0.09852546453475952, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "distill_loss": 0.11007370054721832, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "ref_ce_loss": 0.10336705297231674, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.4215086102485657, + "step": 11750 + }, + { + "ce_loss": 0.11413310468196869, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "distill_loss": 0.1096954271197319, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "ref_ce_loss": 0.13159877061843872, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "loss": 0.4675275981426239, + "step": 11750 + }, + { + "ce_loss": 0.16325342655181885, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "distill_loss": 0.14663267135620117, + "epoch": 3.9192795196797867, + "step": 11750 + }, + { + "epoch": 3.9192795196797867, + "ref_ce_loss": 0.15452298521995544, + "step": 11750 + }, + { + "epoch": 3.922615076717812, + "loss": 0.4391, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "grad_norm": 5.820791721343994, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "learning_rate": 0.00016197886715868987, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "loss": 0.3936474621295929, + "step": 11760 + }, + { + "ce_loss": 0.11980064958333969, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "distill_loss": 0.09972325712442398, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "ref_ce_loss": 0.09936317801475525, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "loss": 0.42065149545669556, + "step": 11760 + }, + { + "ce_loss": 0.0961647778749466, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "distill_loss": 0.11633703112602234, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "ref_ce_loss": 0.09963429719209671, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "loss": 0.4702761173248291, + "step": 11760 + }, + { + "ce_loss": 0.14327195286750793, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "distill_loss": 0.13244634866714478, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "ref_ce_loss": 0.11447618156671524, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "loss": 0.4945146441459656, + "step": 11760 + }, + { + "ce_loss": 0.038211580365896225, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "distill_loss": 0.07644416391849518, + "epoch": 3.922615076717812, + "step": 11760 + }, + { + "epoch": 3.922615076717812, + "ref_ce_loss": 0.07319790124893188, + "step": 11760 + }, + { + "epoch": 3.9259506337558374, + "loss": 0.5131, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "grad_norm": 2.2660369873046875, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "learning_rate": 0.00016177694182181396, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "loss": 0.4388217031955719, + "step": 11770 + }, + { + "ce_loss": 0.08850383013486862, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "distill_loss": 0.18210861086845398, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "ref_ce_loss": 0.1255296766757965, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "loss": 0.6527267694473267, + "step": 11770 + }, + { + "ce_loss": 0.1932949721813202, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "distill_loss": 0.12827904522418976, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "ref_ce_loss": 0.12352578341960907, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "loss": 0.2734456956386566, + "step": 11770 + }, + { + "ce_loss": 0.035691630095243454, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "distill_loss": 0.08057143539190292, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "ref_ce_loss": 0.0883350819349289, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "loss": 0.6890462040901184, + "step": 11770 + }, + { + "ce_loss": 0.2692700922489166, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "distill_loss": 0.17551743984222412, + "epoch": 3.9259506337558374, + "step": 11770 + }, + { + "epoch": 3.9259506337558374, + "ref_ce_loss": 0.1092531830072403, + "step": 11770 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.4957, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "grad_norm": 3.0138349533081055, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "learning_rate": 0.00016157499500844182, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.3256349563598633, + "step": 11780 + }, + { + "ce_loss": 0.08072438836097717, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "distill_loss": 0.12202650308609009, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "ref_ce_loss": 0.12261930853128433, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.28612565994262695, + "step": 11780 + }, + { + "ce_loss": 0.08246175199747086, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "distill_loss": 0.11741185188293457, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "ref_ce_loss": 0.08588752150535583, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.36774736642837524, + "step": 11780 + }, + { + "ce_loss": 0.11784758418798447, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "distill_loss": 0.13903403282165527, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "ref_ce_loss": 0.11045124381780624, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "loss": 0.2915292978286743, + "step": 11780 + }, + { + "ce_loss": 0.07259783893823624, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "distill_loss": 0.09900534152984619, + "epoch": 3.9292861907938628, + "step": 11780 + }, + { + "epoch": 3.9292861907938628, + "ref_ce_loss": 0.05908788740634918, + "step": 11780 + }, + { + "epoch": 3.932621747831888, + "loss": 0.4541, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "grad_norm": 2.875215530395508, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "learning_rate": 0.00016137302708684476, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "loss": 0.35691261291503906, + "step": 11790 + }, + { + "ce_loss": 0.1520719975233078, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "distill_loss": 0.11623502522706985, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "ref_ce_loss": 0.08838573843240738, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "loss": 0.3730715811252594, + "step": 11790 + }, + { + "ce_loss": 0.13004635274410248, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "distill_loss": 0.10884232074022293, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "ref_ce_loss": 0.100712850689888, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "loss": 0.2646164000034332, + "step": 11790 + }, + { + "ce_loss": 0.10139144957065582, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "distill_loss": 0.10436727106571198, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "ref_ce_loss": 0.05834803357720375, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "loss": 0.48992589116096497, + "step": 11790 + }, + { + "ce_loss": 0.1954546868801117, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "distill_loss": 0.17859971523284912, + "epoch": 3.932621747831888, + "step": 11790 + }, + { + "epoch": 3.932621747831888, + "ref_ce_loss": 0.11544232815504074, + "step": 11790 + }, + { + "epoch": 3.9359573048699135, + "loss": 0.5165, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "grad_norm": 4.251229286193848, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "learning_rate": 0.00016117103842533254, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "loss": 0.35116198658943176, + "step": 11800 + }, + { + "ce_loss": 0.11046728491783142, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "distill_loss": 0.1407284140586853, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "ref_ce_loss": 0.09850616753101349, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "loss": 0.6646789908409119, + "step": 11800 + }, + { + "ce_loss": 0.20312164723873138, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "distill_loss": 0.20929650962352753, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "ref_ce_loss": 0.12372786551713943, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "loss": 0.2903001010417938, + "step": 11800 + }, + { + "ce_loss": 0.05333415046334267, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "distill_loss": 0.12783510982990265, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "ref_ce_loss": 0.060100555419921875, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "loss": 0.3517928123474121, + "step": 11800 + }, + { + "ce_loss": 0.07974269986152649, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "distill_loss": 0.1284874528646469, + "epoch": 3.9359573048699135, + "step": 11800 + }, + { + "epoch": 3.9359573048699135, + "ref_ce_loss": 0.06259086728096008, + "step": 11800 + }, + { + "epoch": 3.939292861907939, + "loss": 0.4735, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "grad_norm": 2.0716724395751953, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "learning_rate": 0.00016096902939225283, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "loss": 0.47878918051719666, + "step": 11810 + }, + { + "ce_loss": 0.20008765161037445, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "distill_loss": 0.1378183811903, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "ref_ce_loss": 0.12045397609472275, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "loss": 0.6673761606216431, + "step": 11810 + }, + { + "ce_loss": 0.1397218108177185, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "distill_loss": 0.14506368339061737, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "ref_ce_loss": 0.13423040509223938, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "loss": 0.5018212199211121, + "step": 11810 + }, + { + "ce_loss": 0.1881476491689682, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "distill_loss": 0.1714666485786438, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "ref_ce_loss": 0.09688962996006012, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "loss": 0.4474361836910248, + "step": 11810 + }, + { + "ce_loss": 0.1412636637687683, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "distill_loss": 0.17755410075187683, + "epoch": 3.939292861907939, + "step": 11810 + }, + { + "epoch": 3.939292861907939, + "ref_ce_loss": 0.09456686675548553, + "step": 11810 + }, + { + "epoch": 3.942628418945964, + "loss": 0.4634, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "grad_norm": 3.9476706981658936, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "learning_rate": 0.00016076700035599052, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "loss": 0.7122544050216675, + "step": 11820 + }, + { + "ce_loss": 0.2613614499568939, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "distill_loss": 0.2061549425125122, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "ref_ce_loss": 0.1448051482439041, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "loss": 0.6049351096153259, + "step": 11820 + }, + { + "ce_loss": 0.22536680102348328, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "distill_loss": 0.24145212769508362, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "ref_ce_loss": 0.1379050463438034, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "loss": 0.4970494508743286, + "step": 11820 + }, + { + "ce_loss": 0.1067628264427185, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "distill_loss": 0.14002716541290283, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "ref_ce_loss": 0.08016132563352585, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "loss": 0.35276898741722107, + "step": 11820 + }, + { + "ce_loss": 0.12041664868593216, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "distill_loss": 0.1190585047006607, + "epoch": 3.942628418945964, + "step": 11820 + }, + { + "epoch": 3.942628418945964, + "ref_ce_loss": 0.0713760107755661, + "step": 11820 + }, + { + "epoch": 3.9459639759839895, + "loss": 0.4898, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "grad_norm": 3.2094149589538574, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "learning_rate": 0.0001605649516849667, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "loss": 0.4047658443450928, + "step": 11830 + }, + { + "ce_loss": 0.08828526735305786, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "distill_loss": 0.1679137498140335, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "ref_ce_loss": 0.07592868059873581, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "loss": 0.38525718450546265, + "step": 11830 + }, + { + "ce_loss": 0.1573115736246109, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "distill_loss": 0.14555348455905914, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "ref_ce_loss": 0.08227339386940002, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "loss": 0.6348785161972046, + "step": 11830 + }, + { + "ce_loss": 0.20729894936084747, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "distill_loss": 0.16912637650966644, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "ref_ce_loss": 0.1757945418357849, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "loss": 0.6446617841720581, + "step": 11830 + }, + { + "ce_loss": 0.216706320643425, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "distill_loss": 0.17024075984954834, + "epoch": 3.9459639759839895, + "step": 11830 + }, + { + "epoch": 3.9459639759839895, + "ref_ce_loss": 0.1002252846956253, + "step": 11830 + }, + { + "epoch": 3.949299533022015, + "loss": 0.5071, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "grad_norm": 3.606684684753418, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "learning_rate": 0.00016036288374763862, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "loss": 0.4314247667789459, + "step": 11840 + }, + { + "ce_loss": 0.14194534718990326, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "distill_loss": 0.1760086715221405, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "ref_ce_loss": 0.11337415874004364, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "loss": 0.5289191007614136, + "step": 11840 + }, + { + "ce_loss": 0.1298551708459854, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "distill_loss": 0.16319157183170319, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "ref_ce_loss": 0.10791688412427902, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "loss": 0.5899083018302917, + "step": 11840 + }, + { + "ce_loss": 0.10379697382450104, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "distill_loss": 0.1373264640569687, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "ref_ce_loss": 0.09423581510782242, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "loss": 0.4148091673851013, + "step": 11840 + }, + { + "ce_loss": 0.11106943339109421, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "distill_loss": 0.17882095277309418, + "epoch": 3.949299533022015, + "step": 11840 + }, + { + "epoch": 3.949299533022015, + "ref_ce_loss": 0.12472303211688995, + "step": 11840 + }, + { + "epoch": 3.95263509006004, + "loss": 0.5228, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "grad_norm": 2.477299451828003, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "learning_rate": 0.00016016079691249835, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "loss": 0.30184489488601685, + "step": 11850 + }, + { + "ce_loss": 0.08025673031806946, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "distill_loss": 0.13255445659160614, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "ref_ce_loss": 0.05937567353248596, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "loss": 0.9538842439651489, + "step": 11850 + }, + { + "ce_loss": 0.22645705938339233, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "distill_loss": 0.29409679770469666, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "ref_ce_loss": 0.15761788189411163, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "loss": 0.5301204919815063, + "step": 11850 + }, + { + "ce_loss": 0.18444590270519257, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "distill_loss": 0.179538756608963, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "ref_ce_loss": 0.1306769698858261, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "loss": 0.36008432507514954, + "step": 11850 + }, + { + "ce_loss": 0.09907856583595276, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "distill_loss": 0.15818718075752258, + "epoch": 3.95263509006004, + "step": 11850 + }, + { + "epoch": 3.95263509006004, + "ref_ce_loss": 0.0713249072432518, + "step": 11850 + }, + { + "epoch": 3.9559706470980656, + "loss": 0.4798, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "grad_norm": 2.704888105392456, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "learning_rate": 0.00015995869154807266, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "loss": 0.42570406198501587, + "step": 11860 + }, + { + "ce_loss": 0.13900217413902283, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "distill_loss": 0.1536346822977066, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "ref_ce_loss": 0.07783767580986023, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "loss": 0.4678839147090912, + "step": 11860 + }, + { + "ce_loss": 0.09099794179201126, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "distill_loss": 0.22181285917758942, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "ref_ce_loss": 0.11604619771242142, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "loss": 0.461051881313324, + "step": 11860 + }, + { + "ce_loss": 0.09864752739667892, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "distill_loss": 0.09917500615119934, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "ref_ce_loss": 0.09517745673656464, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "loss": 0.5379061102867126, + "step": 11860 + }, + { + "ce_loss": 0.13948775827884674, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "distill_loss": 0.2097395360469818, + "epoch": 3.9559706470980656, + "step": 11860 + }, + { + "epoch": 3.9559706470980656, + "ref_ce_loss": 0.13162483274936676, + "step": 11860 + }, + { + "epoch": 3.959306204136091, + "loss": 0.5352, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "grad_norm": 3.3357937335968018, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "learning_rate": 0.00015975656802292196, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "loss": 0.5465363264083862, + "step": 11870 + }, + { + "ce_loss": 0.10335254669189453, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "distill_loss": 0.22682029008865356, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "ref_ce_loss": 0.119485042989254, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "loss": 0.3730747401714325, + "step": 11870 + }, + { + "ce_loss": 0.09395212680101395, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "distill_loss": 0.19650858640670776, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "ref_ce_loss": 0.08227689564228058, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "loss": 0.5724227428436279, + "step": 11870 + }, + { + "ce_loss": 0.07213887572288513, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "distill_loss": 0.165091872215271, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "ref_ce_loss": 0.07907076925039291, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "loss": 0.9376710653305054, + "step": 11870 + }, + { + "ce_loss": 0.08496998250484467, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "distill_loss": 0.18652302026748657, + "epoch": 3.959306204136091, + "step": 11870 + }, + { + "epoch": 3.959306204136091, + "ref_ce_loss": 0.10293760895729065, + "step": 11870 + }, + { + "epoch": 3.9626417611741163, + "loss": 0.5433, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "grad_norm": 3.060880661010742, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "learning_rate": 0.00015955442670563983, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "loss": 0.38805532455444336, + "step": 11880 + }, + { + "ce_loss": 0.0926336944103241, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "distill_loss": 0.1631380021572113, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "ref_ce_loss": 0.10547516494989395, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "loss": 0.4460315704345703, + "step": 11880 + }, + { + "ce_loss": 0.13091424107551575, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "distill_loss": 0.19473236799240112, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "ref_ce_loss": 0.0909007340669632, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "loss": 0.6728657484054565, + "step": 11880 + }, + { + "ce_loss": 0.10651903599500656, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "distill_loss": 0.186860591173172, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "ref_ce_loss": 0.0980074554681778, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "loss": 0.4039444327354431, + "step": 11880 + }, + { + "ce_loss": 0.07333799451589584, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "distill_loss": 0.21126312017440796, + "epoch": 3.9626417611741163, + "step": 11880 + }, + { + "epoch": 3.9626417611741163, + "ref_ce_loss": 0.08966390788555145, + "step": 11880 + }, + { + "epoch": 3.9659773182121416, + "loss": 0.4965, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "grad_norm": 3.569099187850952, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "learning_rate": 0.00015935226796485227, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "loss": 0.5312309265136719, + "step": 11890 + }, + { + "ce_loss": 0.11842554807662964, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "distill_loss": 0.20231810212135315, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "ref_ce_loss": 0.09051153063774109, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "loss": 0.3848481774330139, + "step": 11890 + }, + { + "ce_loss": 0.12863628566265106, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "distill_loss": 0.1335049271583557, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "ref_ce_loss": 0.12259049713611603, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "loss": 0.3410494923591614, + "step": 11890 + }, + { + "ce_loss": 0.10201102495193481, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "distill_loss": 0.15431654453277588, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "ref_ce_loss": 0.08442019671201706, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "loss": 0.4434935748577118, + "step": 11890 + }, + { + "ce_loss": 0.14288806915283203, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "distill_loss": 0.17208629846572876, + "epoch": 3.9659773182121416, + "step": 11890 + }, + { + "epoch": 3.9659773182121416, + "ref_ce_loss": 0.10217718780040741, + "step": 11890 + }, + { + "epoch": 3.969312875250167, + "loss": 0.5087, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "grad_norm": 2.463893413543701, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "learning_rate": 0.00015915009216921716, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "loss": 0.9179453253746033, + "step": 11900 + }, + { + "ce_loss": 0.14963048696517944, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "distill_loss": 0.4440215528011322, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "ref_ce_loss": 0.171931654214859, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "loss": 0.636218786239624, + "step": 11900 + }, + { + "ce_loss": 0.16180500388145447, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "distill_loss": 0.17104534804821014, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "ref_ce_loss": 0.12259872257709503, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "loss": 0.8462910652160645, + "step": 11900 + }, + { + "ce_loss": 0.14126595854759216, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "distill_loss": 0.21537429094314575, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "ref_ce_loss": 0.15868441760540009, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "loss": 0.6787412166595459, + "step": 11900 + }, + { + "ce_loss": 0.2031691074371338, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "distill_loss": 0.15440312027931213, + "epoch": 3.969312875250167, + "step": 11900 + }, + { + "epoch": 3.969312875250167, + "ref_ce_loss": 0.12730467319488525, + "step": 11900 + }, + { + "epoch": 3.9726484322881923, + "loss": 0.5596, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "grad_norm": 4.980892658233643, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "learning_rate": 0.0001589478996874233, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "loss": 0.5536061525344849, + "step": 11910 + }, + { + "ce_loss": 0.15429961681365967, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "distill_loss": 0.2155410647392273, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "ref_ce_loss": 0.12953603267669678, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "loss": 0.4384726881980896, + "step": 11910 + }, + { + "ce_loss": 0.10884366929531097, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "distill_loss": 0.17421254515647888, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "ref_ce_loss": 0.10366030037403107, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "loss": 0.3859587013721466, + "step": 11910 + }, + { + "ce_loss": 0.08458442240953445, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "distill_loss": 0.12382687628269196, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "ref_ce_loss": 0.09032014012336731, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "loss": 0.6750946044921875, + "step": 11910 + }, + { + "ce_loss": 0.2623468339443207, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "distill_loss": 0.180451437830925, + "epoch": 3.9726484322881923, + "step": 11910 + }, + { + "epoch": 3.9726484322881923, + "ref_ce_loss": 0.1338099092245102, + "step": 11910 + }, + { + "epoch": 3.9759839893262177, + "loss": 0.5104, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "grad_norm": 3.3186306953430176, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "learning_rate": 0.00015874569088819015, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "loss": 0.4923209249973297, + "step": 11920 + }, + { + "ce_loss": 0.13076621294021606, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "distill_loss": 0.14761069416999817, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "ref_ce_loss": 0.15832741558551788, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "loss": 0.2980753183364868, + "step": 11920 + }, + { + "ce_loss": 0.08580087870359421, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "distill_loss": 0.15114721655845642, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "ref_ce_loss": 0.060876086354255676, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "loss": 0.5571591854095459, + "step": 11920 + }, + { + "ce_loss": 0.15017174184322357, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "distill_loss": 0.22002549469470978, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "ref_ce_loss": 0.09577220678329468, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "loss": 0.5138970017433167, + "step": 11920 + }, + { + "ce_loss": 0.18627305328845978, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "distill_loss": 0.1946372538805008, + "epoch": 3.9759839893262177, + "step": 11920 + }, + { + "epoch": 3.9759839893262177, + "ref_ce_loss": 0.0819023847579956, + "step": 11920 + }, + { + "epoch": 3.979319546364243, + "loss": 0.5138, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "grad_norm": 2.3914072513580322, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "learning_rate": 0.0001585434661402667, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "loss": 0.3897310495376587, + "step": 11930 + }, + { + "ce_loss": 0.09720531105995178, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "distill_loss": 0.16733823716640472, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "ref_ce_loss": 0.1047656387090683, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "loss": 0.5685856938362122, + "step": 11930 + }, + { + "ce_loss": 0.14480598270893097, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "distill_loss": 0.1961873173713684, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "ref_ce_loss": 0.11797832697629929, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "loss": 0.5430389642715454, + "step": 11930 + }, + { + "ce_loss": 0.16561175882816315, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "distill_loss": 0.19522586464881897, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "ref_ce_loss": 0.1352234035730362, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "loss": 0.5040985345840454, + "step": 11930 + }, + { + "ce_loss": 0.13989169895648956, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "distill_loss": 0.13456882536411285, + "epoch": 3.979319546364243, + "step": 11930 + }, + { + "epoch": 3.979319546364243, + "ref_ce_loss": 0.10855761915445328, + "step": 11930 + }, + { + "epoch": 3.9826551034022684, + "loss": 0.4939, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "grad_norm": 3.0207884311676025, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "learning_rate": 0.00015834122581243103, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "loss": 0.5423541069030762, + "step": 11940 + }, + { + "ce_loss": 0.11814697831869125, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "distill_loss": 0.15340958535671234, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "ref_ce_loss": 0.10886798053979874, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "loss": 0.5537235140800476, + "step": 11940 + }, + { + "ce_loss": 0.2218320369720459, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "distill_loss": 0.19175180792808533, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "ref_ce_loss": 0.10942274332046509, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "loss": 0.5389870405197144, + "step": 11940 + }, + { + "ce_loss": 0.11036817729473114, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "distill_loss": 0.18006466329097748, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "ref_ce_loss": 0.12648382782936096, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "loss": 0.6045240163803101, + "step": 11940 + }, + { + "ce_loss": 0.21640561521053314, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "distill_loss": 0.208718404173851, + "epoch": 3.9826551034022684, + "step": 11940 + }, + { + "epoch": 3.9826551034022684, + "ref_ce_loss": 0.08881595730781555, + "step": 11940 + }, + { + "epoch": 3.9859906604402937, + "loss": 0.5379, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "grad_norm": 3.2304489612579346, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "learning_rate": 0.00015813897027348989, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "loss": 0.2578718364238739, + "step": 11950 + }, + { + "ce_loss": 0.06574041396379471, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "distill_loss": 0.11338651180267334, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "ref_ce_loss": 0.05241406708955765, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "loss": 0.2795087397098541, + "step": 11950 + }, + { + "ce_loss": 0.031032180413603783, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "distill_loss": 0.11463324725627899, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "ref_ce_loss": 0.08366985619068146, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "loss": 0.6945253610610962, + "step": 11950 + }, + { + "ce_loss": 0.22229063510894775, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "distill_loss": 0.18065792322158813, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "ref_ce_loss": 0.09593914449214935, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "loss": 0.4455210268497467, + "step": 11950 + }, + { + "ce_loss": 0.17479459941387177, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "distill_loss": 0.18478310108184814, + "epoch": 3.9859906604402937, + "step": 11950 + }, + { + "epoch": 3.9859906604402937, + "ref_ce_loss": 0.08535302430391312, + "step": 11950 + }, + { + "epoch": 3.989326217478319, + "loss": 0.546, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "grad_norm": 2.468493938446045, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "learning_rate": 0.00015793669989227758, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "loss": 0.5372409820556641, + "step": 11960 + }, + { + "ce_loss": 0.14682519435882568, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "distill_loss": 0.23497946560382843, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "ref_ce_loss": 0.09852960705757141, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "loss": 0.3980017900466919, + "step": 11960 + }, + { + "ce_loss": 0.1523795872926712, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "distill_loss": 0.1418992578983307, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "ref_ce_loss": 0.1035848930478096, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "loss": 0.45328015089035034, + "step": 11960 + }, + { + "ce_loss": 0.14866439998149872, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "distill_loss": 0.24166658520698547, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "ref_ce_loss": 0.06284800916910172, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "loss": 1.29494309425354, + "step": 11960 + }, + { + "ce_loss": 0.22440198063850403, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "distill_loss": 0.3025294244289398, + "epoch": 3.989326217478319, + "step": 11960 + }, + { + "epoch": 3.989326217478319, + "ref_ce_loss": 0.1537446528673172, + "step": 11960 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.5301, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "grad_norm": 3.436962604522705, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "learning_rate": 0.00015773441503765537, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.38053038716316223, + "step": 11970 + }, + { + "ce_loss": 0.12679526209831238, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "distill_loss": 0.18604570627212524, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "ref_ce_loss": 0.06746406108140945, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.33811435103416443, + "step": 11970 + }, + { + "ce_loss": 0.06887925416231155, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "distill_loss": 0.14557978510856628, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "ref_ce_loss": 0.06431765854358673, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.43382972478866577, + "step": 11970 + }, + { + "ce_loss": 0.14090314507484436, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "distill_loss": 0.20701713860034943, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "ref_ce_loss": 0.05977332219481468, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "loss": 0.5020667314529419, + "step": 11970 + }, + { + "ce_loss": 0.11767388880252838, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "distill_loss": 0.2077358216047287, + "epoch": 3.9926617745163444, + "step": 11970 + }, + { + "epoch": 3.9926617745163444, + "ref_ce_loss": 0.11229929327964783, + "step": 11970 + }, + { + "epoch": 3.9959973315543698, + "loss": 0.5052, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "grad_norm": 3.4483630657196045, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "learning_rate": 0.00015753211607851114, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "loss": 0.8230859637260437, + "step": 11980 + }, + { + "ce_loss": 0.1882457435131073, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "distill_loss": 0.17528904974460602, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "ref_ce_loss": 0.09798739105463028, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "loss": 0.48872292041778564, + "step": 11980 + }, + { + "ce_loss": 0.12391944974660873, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "distill_loss": 0.19896352291107178, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "ref_ce_loss": 0.11606057733297348, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "loss": 0.42669156193733215, + "step": 11980 + }, + { + "ce_loss": 0.07362711429595947, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "distill_loss": 0.1800188571214676, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "ref_ce_loss": 0.11211638152599335, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "loss": 0.9327055215835571, + "step": 11980 + }, + { + "ce_loss": 0.15583303570747375, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "distill_loss": 0.20757633447647095, + "epoch": 3.9959973315543698, + "step": 11980 + }, + { + "epoch": 3.9959973315543698, + "ref_ce_loss": 0.12007487565279007, + "step": 11980 + }, + { + "epoch": 3.999332888592395, + "loss": 0.494, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "grad_norm": 1.737951397895813, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "learning_rate": 0.00015732980338375836, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "loss": 0.37717702984809875, + "step": 11990 + }, + { + "ce_loss": 0.12793800234794617, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "distill_loss": 0.17517952620983124, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "ref_ce_loss": 0.07389137148857117, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "loss": 0.23578205704689026, + "step": 11990 + }, + { + "ce_loss": 0.04605966433882713, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "distill_loss": 0.12195150554180145, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "ref_ce_loss": 0.06749168783426285, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "loss": 0.45324552059173584, + "step": 11990 + }, + { + "ce_loss": 0.16480252146720886, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "distill_loss": 0.16457819938659668, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "ref_ce_loss": 0.12353596836328506, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "loss": 0.3933228552341461, + "step": 11990 + }, + { + "ce_loss": 0.1270236372947693, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "distill_loss": 0.17129284143447876, + "epoch": 3.999332888592395, + "step": 11990 + }, + { + "epoch": 3.999332888592395, + "ref_ce_loss": 0.09473787248134613, + "step": 11990 + }, + { + "epoch": 4.0026684456304205, + "loss": 0.5063, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "grad_norm": 3.112962245941162, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "learning_rate": 0.00015712747732233556, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "loss": 0.37287402153015137, + "step": 12000 + }, + { + "ce_loss": 0.06412719190120697, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "distill_loss": 0.12839214503765106, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "ref_ce_loss": 0.09659893810749054, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "loss": 0.5252148509025574, + "step": 12000 + }, + { + "ce_loss": 0.125128373503685, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "distill_loss": 0.15444694459438324, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "ref_ce_loss": 0.09184499830007553, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "loss": 0.4332132637500763, + "step": 12000 + }, + { + "ce_loss": 0.10666043311357498, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "distill_loss": 0.13837860524654388, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "ref_ce_loss": 0.12885423004627228, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "loss": 0.9290437698364258, + "step": 12000 + }, + { + "ce_loss": 0.15852229297161102, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "distill_loss": 0.17924055457115173, + "epoch": 4.0026684456304205, + "step": 12000 + }, + { + "epoch": 4.0026684456304205, + "ref_ce_loss": 0.1165415495634079, + "step": 12000 + }, + { + "epoch": 4.006004002668446, + "loss": 0.4461, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "grad_norm": 2.8395586013793945, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "learning_rate": 0.00015692513826320571, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "loss": 0.3618427813053131, + "step": 12010 + }, + { + "ce_loss": 0.07987383753061295, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "distill_loss": 0.14220160245895386, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "ref_ce_loss": 0.0801934227347374, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "loss": 0.5198785066604614, + "step": 12010 + }, + { + "ce_loss": 0.1803334504365921, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "distill_loss": 0.198713481426239, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "ref_ce_loss": 0.10547371953725815, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "loss": 0.2746797502040863, + "step": 12010 + }, + { + "ce_loss": 0.0662359818816185, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "distill_loss": 0.09645555913448334, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "ref_ce_loss": 0.11164869368076324, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "loss": 0.5860291719436646, + "step": 12010 + }, + { + "ce_loss": 0.09020759165287018, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "distill_loss": 0.15296566486358643, + "epoch": 4.006004002668446, + "step": 12010 + }, + { + "epoch": 4.006004002668446, + "ref_ce_loss": 0.08592826873064041, + "step": 12010 + }, + { + "epoch": 4.009339559706471, + "loss": 0.4357, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "grad_norm": 2.628180503845215, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "learning_rate": 0.00015672278657535537, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "loss": 0.36614707112312317, + "step": 12020 + }, + { + "ce_loss": 0.07927390933036804, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "distill_loss": 0.15328463912010193, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "ref_ce_loss": 0.0527513325214386, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "loss": 0.34300854802131653, + "step": 12020 + }, + { + "ce_loss": 0.09013242274522781, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "distill_loss": 0.10978487133979797, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "ref_ce_loss": 0.12032110244035721, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "loss": 0.6139493584632874, + "step": 12020 + }, + { + "ce_loss": 0.09104227274656296, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "distill_loss": 0.24132594466209412, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "ref_ce_loss": 0.12532581388950348, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "loss": 0.33067432045936584, + "step": 12020 + }, + { + "ce_loss": 0.10223788768053055, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "distill_loss": 0.15111784636974335, + "epoch": 4.009339559706471, + "step": 12020 + }, + { + "epoch": 4.009339559706471, + "ref_ce_loss": 0.06201582029461861, + "step": 12020 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.4694, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "grad_norm": 4.58229923248291, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "learning_rate": 0.00015652042262779425, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.34944185614585876, + "step": 12030 + }, + { + "ce_loss": 0.07081367820501328, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "distill_loss": 0.11567500233650208, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "ref_ce_loss": 0.11731644719839096, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.25561827421188354, + "step": 12030 + }, + { + "ce_loss": 0.037178147584199905, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "distill_loss": 0.09498228132724762, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "ref_ce_loss": 0.07046579569578171, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.33924317359924316, + "step": 12030 + }, + { + "ce_loss": 0.048774536699056625, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "distill_loss": 0.13083665072917938, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "ref_ce_loss": 0.06417500972747803, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "loss": 0.21740387380123138, + "step": 12030 + }, + { + "ce_loss": 0.024787340313196182, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "distill_loss": 0.1325160712003708, + "epoch": 4.0126751167444965, + "step": 12030 + }, + { + "epoch": 4.0126751167444965, + "ref_ce_loss": 0.059933774173259735, + "step": 12030 + }, + { + "epoch": 4.016010673782522, + "loss": 0.4299, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "grad_norm": 2.332430362701416, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "learning_rate": 0.0001563180467895544, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "loss": 0.31575217843055725, + "step": 12040 + }, + { + "ce_loss": 0.05037858337163925, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "distill_loss": 0.13738572597503662, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "ref_ce_loss": 0.05846783518791199, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "loss": 0.3863769769668579, + "step": 12040 + }, + { + "ce_loss": 0.12451977282762527, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "distill_loss": 0.08762554079294205, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "ref_ce_loss": 0.13553641736507416, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "loss": 0.38889557123184204, + "step": 12040 + }, + { + "ce_loss": 0.07112105190753937, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "distill_loss": 0.15990790724754333, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "ref_ce_loss": 0.09904682636260986, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "loss": 0.458415687084198, + "step": 12040 + }, + { + "ce_loss": 0.04768303409218788, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "distill_loss": 0.12121204286813736, + "epoch": 4.016010673782522, + "step": 12040 + }, + { + "epoch": 4.016010673782522, + "ref_ce_loss": 0.0890619307756424, + "step": 12040 + }, + { + "epoch": 4.019346230820547, + "loss": 0.4102, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "grad_norm": 2.3165054321289062, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "learning_rate": 0.00015611565942968942, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "loss": 0.2301270216703415, + "step": 12050 + }, + { + "ce_loss": 0.07038333266973495, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "distill_loss": 0.09666424989700317, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "ref_ce_loss": 0.06273900717496872, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "loss": 0.3673703670501709, + "step": 12050 + }, + { + "ce_loss": 0.10099012404680252, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "distill_loss": 0.1266748458147049, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "ref_ce_loss": 0.038833919912576675, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "loss": 0.47880423069000244, + "step": 12050 + }, + { + "ce_loss": 0.1467541754245758, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "distill_loss": 0.1850563883781433, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "ref_ce_loss": 0.10418304055929184, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "loss": 0.37483644485473633, + "step": 12050 + }, + { + "ce_loss": 0.07514076679944992, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "distill_loss": 0.12129350006580353, + "epoch": 4.019346230820547, + "step": 12050 + }, + { + "epoch": 4.019346230820547, + "ref_ce_loss": 0.10785885155200958, + "step": 12050 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.4115, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "grad_norm": 2.2294394969940186, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "learning_rate": 0.00015591326091727415, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.4823136627674103, + "step": 12060 + }, + { + "ce_loss": 0.1290162205696106, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "distill_loss": 0.20154951512813568, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "ref_ce_loss": 0.06897612661123276, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.35938891768455505, + "step": 12060 + }, + { + "ce_loss": 0.05964622274041176, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "distill_loss": 0.15778091549873352, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "ref_ce_loss": 0.10114988684654236, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.5172111392021179, + "step": 12060 + }, + { + "ce_loss": 0.1536588817834854, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "distill_loss": 0.16821041703224182, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "ref_ce_loss": 0.09951279312372208, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "loss": 0.38765984773635864, + "step": 12060 + }, + { + "ce_loss": 0.04113280028104782, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "distill_loss": 0.10768014937639236, + "epoch": 4.0226817878585726, + "step": 12060 + }, + { + "epoch": 4.0226817878585726, + "ref_ce_loss": 0.0842534527182579, + "step": 12060 + }, + { + "epoch": 4.026017344896598, + "loss": 0.4356, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "grad_norm": 5.903532981872559, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "learning_rate": 0.00015571085162140348, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "loss": 0.37335243821144104, + "step": 12070 + }, + { + "ce_loss": 0.09252926707267761, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "distill_loss": 0.14429230988025665, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "ref_ce_loss": 0.06657299399375916, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "loss": 0.37593209743499756, + "step": 12070 + }, + { + "ce_loss": 0.10314659774303436, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "distill_loss": 0.14421296119689941, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "ref_ce_loss": 0.0858980119228363, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "loss": 0.30769476294517517, + "step": 12070 + }, + { + "ce_loss": 0.07112128287553787, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "distill_loss": 0.1462993174791336, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "ref_ce_loss": 0.0900954082608223, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "loss": 0.5134388208389282, + "step": 12070 + }, + { + "ce_loss": 0.0642455592751503, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "distill_loss": 0.13371606171131134, + "epoch": 4.026017344896598, + "step": 12070 + }, + { + "epoch": 4.026017344896598, + "ref_ce_loss": 0.08747991174459457, + "step": 12070 + }, + { + "epoch": 4.029352901934623, + "loss": 0.4413, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "grad_norm": 2.3316900730133057, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "learning_rate": 0.0001555084319111922, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "loss": 0.7505779266357422, + "step": 12080 + }, + { + "ce_loss": 0.14793424308300018, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "distill_loss": 0.22277532517910004, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "ref_ce_loss": 0.12870121002197266, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "loss": 0.8267667889595032, + "step": 12080 + }, + { + "ce_loss": 0.10344298183917999, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "distill_loss": 0.3623758554458618, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "ref_ce_loss": 0.11685804277658463, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "loss": 0.9109047651290894, + "step": 12080 + }, + { + "ce_loss": 0.10552089661359787, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "distill_loss": 0.19695395231246948, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "ref_ce_loss": 0.09001109004020691, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "loss": 0.38273149728775024, + "step": 12080 + }, + { + "ce_loss": 0.04735790193080902, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "distill_loss": 0.11591293662786484, + "epoch": 4.029352901934623, + "step": 12080 + }, + { + "epoch": 4.029352901934623, + "ref_ce_loss": 0.054501067847013474, + "step": 12080 + }, + { + "epoch": 4.032688458972649, + "loss": 0.4852, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "grad_norm": 3.8199822902679443, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "learning_rate": 0.00015530600215577406, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "loss": 0.45621997117996216, + "step": 12090 + }, + { + "ce_loss": 0.11262834072113037, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "distill_loss": 0.15463413298130035, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "ref_ce_loss": 0.07563820481300354, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "loss": 0.3850533962249756, + "step": 12090 + }, + { + "ce_loss": 0.08699747920036316, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "distill_loss": 0.14912444353103638, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "ref_ce_loss": 0.08690159022808075, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "loss": 0.39265933632850647, + "step": 12090 + }, + { + "ce_loss": 0.10053718090057373, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "distill_loss": 0.20805439352989197, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "ref_ce_loss": 0.08398868143558502, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "loss": 0.8067128658294678, + "step": 12090 + }, + { + "ce_loss": 0.11772124469280243, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "distill_loss": 0.21782295405864716, + "epoch": 4.032688458972649, + "step": 12090 + }, + { + "epoch": 4.032688458972649, + "ref_ce_loss": 0.12272774428129196, + "step": 12090 + }, + { + "epoch": 4.036024016010674, + "loss": 0.4555, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "grad_norm": 4.044421195983887, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "learning_rate": 0.00015510356272430104, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "loss": 0.4507158100605011, + "step": 12100 + }, + { + "ce_loss": 0.15286117792129517, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "distill_loss": 0.16192707419395447, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "ref_ce_loss": 0.135764941573143, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "loss": 0.46531906723976135, + "step": 12100 + }, + { + "ce_loss": 0.09542107582092285, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "distill_loss": 0.1124882772564888, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "ref_ce_loss": 0.10861322283744812, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "loss": 0.253815233707428, + "step": 12100 + }, + { + "ce_loss": 0.06234263256192207, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "distill_loss": 0.10280825197696686, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "ref_ce_loss": 0.06219073012471199, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "loss": 0.3336341977119446, + "step": 12100 + }, + { + "ce_loss": 0.06819576770067215, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "distill_loss": 0.11720435321331024, + "epoch": 4.036024016010674, + "step": 12100 + }, + { + "epoch": 4.036024016010674, + "ref_ce_loss": 0.11607708036899567, + "step": 12100 + }, + { + "epoch": 4.039359573048699, + "loss": 0.4412, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "grad_norm": 1.9481059312820435, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "learning_rate": 0.00015490111398594274, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "loss": 0.29373347759246826, + "step": 12110 + }, + { + "ce_loss": 0.04972933977842331, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "distill_loss": 0.13565079867839813, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "ref_ce_loss": 0.06544889509677887, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "loss": 0.3456859588623047, + "step": 12110 + }, + { + "ce_loss": 0.08374223858118057, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "distill_loss": 0.12544633448123932, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "ref_ce_loss": 0.05935639142990112, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "loss": 0.3637619614601135, + "step": 12110 + }, + { + "ce_loss": 0.113809734582901, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "distill_loss": 0.14481617510318756, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "ref_ce_loss": 0.08502238243818283, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "loss": 0.4461738169193268, + "step": 12110 + }, + { + "ce_loss": 0.1227860376238823, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "distill_loss": 0.21389171481132507, + "epoch": 4.039359573048699, + "step": 12110 + }, + { + "epoch": 4.039359573048699, + "ref_ce_loss": 0.10931068658828735, + "step": 12110 + }, + { + "epoch": 4.042695130086725, + "loss": 0.4501, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "grad_norm": 2.8527162075042725, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "learning_rate": 0.0001546986563098859, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "loss": 0.3520258069038391, + "step": 12120 + }, + { + "ce_loss": 0.0780138224363327, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "distill_loss": 0.14417199790477753, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "ref_ce_loss": 0.09585023671388626, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "loss": 0.5828714370727539, + "step": 12120 + }, + { + "ce_loss": 0.09808569401502609, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "distill_loss": 0.13045603036880493, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "ref_ce_loss": 0.07338562607765198, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "loss": 0.2788516581058502, + "step": 12120 + }, + { + "ce_loss": 0.06606944650411606, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "distill_loss": 0.10744508355855942, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "ref_ce_loss": 0.06514713913202286, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "loss": 0.30643728375434875, + "step": 12120 + }, + { + "ce_loss": 0.08950202167034149, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "distill_loss": 0.14668361842632294, + "epoch": 4.042695130086725, + "step": 12120 + }, + { + "epoch": 4.042695130086725, + "ref_ce_loss": 0.0699087455868721, + "step": 12120 + }, + { + "epoch": 4.04603068712475, + "loss": 0.4806, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "grad_norm": 2.6800968647003174, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "learning_rate": 0.00015449619006533343, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "loss": 0.49065351486206055, + "step": 12130 + }, + { + "ce_loss": 0.06231939420104027, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "distill_loss": 0.22406406700611115, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "ref_ce_loss": 0.07999047636985779, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "loss": 0.45048731565475464, + "step": 12130 + }, + { + "ce_loss": 0.09844529628753662, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "distill_loss": 0.14128665626049042, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "ref_ce_loss": 0.12517352402210236, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "loss": 0.4383133053779602, + "step": 12130 + }, + { + "ce_loss": 0.12598662078380585, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "distill_loss": 0.1633603870868683, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "ref_ce_loss": 0.09888887405395508, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "loss": 0.24241933226585388, + "step": 12130 + }, + { + "ce_loss": 0.04206832870841026, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "distill_loss": 0.14933034777641296, + "epoch": 4.04603068712475, + "step": 12130 + }, + { + "epoch": 4.04603068712475, + "ref_ce_loss": 0.050794150680303574, + "step": 12130 + }, + { + "epoch": 4.049366244162775, + "loss": 0.4709, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "grad_norm": 2.5225110054016113, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "learning_rate": 0.00015429371562150385, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "loss": 0.5925564765930176, + "step": 12140 + }, + { + "ce_loss": 0.05308568477630615, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "distill_loss": 0.15124325454235077, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "ref_ce_loss": 0.07154041528701782, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "loss": 0.3111650347709656, + "step": 12140 + }, + { + "ce_loss": 0.07286947965621948, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "distill_loss": 0.12985488772392273, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "ref_ce_loss": 0.08120644092559814, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "loss": 0.5160831212997437, + "step": 12140 + }, + { + "ce_loss": 0.08501293510198593, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "distill_loss": 0.13488444685935974, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "ref_ce_loss": 0.08812274038791656, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "loss": 0.2512379288673401, + "step": 12140 + }, + { + "ce_loss": 0.07532287389039993, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "distill_loss": 0.10057376325130463, + "epoch": 4.049366244162775, + "step": 12140 + }, + { + "epoch": 4.049366244162775, + "ref_ce_loss": 0.07492376863956451, + "step": 12140 + }, + { + "epoch": 4.052701801200801, + "loss": 0.4284, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "grad_norm": 4.018413066864014, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "learning_rate": 0.00015409123334763077, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "loss": 0.47143644094467163, + "step": 12150 + }, + { + "ce_loss": 0.11935675889253616, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "distill_loss": 0.19259300827980042, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "ref_ce_loss": 0.1302216798067093, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "loss": 0.3792656660079956, + "step": 12150 + }, + { + "ce_loss": 0.11778055131435394, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "distill_loss": 0.17504708468914032, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "ref_ce_loss": 0.08613471686840057, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "loss": 0.5767985582351685, + "step": 12150 + }, + { + "ce_loss": 0.16085843741893768, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "distill_loss": 0.24712547659873962, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "ref_ce_loss": 0.12454459071159363, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "loss": 0.4131409823894501, + "step": 12150 + }, + { + "ce_loss": 0.053598470985889435, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "distill_loss": 0.16270749270915985, + "epoch": 4.052701801200801, + "step": 12150 + }, + { + "epoch": 4.052701801200801, + "ref_ce_loss": 0.08859755843877792, + "step": 12150 + }, + { + "epoch": 4.056037358238826, + "loss": 0.4947, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "grad_norm": 3.498987913131714, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "learning_rate": 0.00015388874361296184, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "loss": 0.3833063542842865, + "step": 12160 + }, + { + "ce_loss": 0.08747068792581558, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "distill_loss": 0.14331986010074615, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "ref_ce_loss": 0.062439221888780594, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "loss": 0.6706572771072388, + "step": 12160 + }, + { + "ce_loss": 0.17328523099422455, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "distill_loss": 0.22888147830963135, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "ref_ce_loss": 0.0902508795261383, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "loss": 0.6042168140411377, + "step": 12160 + }, + { + "ce_loss": 0.12251890450716019, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "distill_loss": 0.22527821362018585, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "ref_ce_loss": 0.10940185189247131, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "loss": 0.35750412940979004, + "step": 12160 + }, + { + "ce_loss": 0.04393932968378067, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "distill_loss": 0.13152629137039185, + "epoch": 4.056037358238826, + "step": 12160 + }, + { + "epoch": 4.056037358238826, + "ref_ce_loss": 0.09057697653770447, + "step": 12160 + }, + { + "epoch": 4.059372915276851, + "loss": 0.4463, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "grad_norm": 3.839312791824341, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "learning_rate": 0.00015368624678675858, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "loss": 0.2831639051437378, + "step": 12170 + }, + { + "ce_loss": 0.06703417003154755, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "distill_loss": 0.1631172150373459, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "ref_ce_loss": 0.05295206606388092, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "loss": 0.5331304669380188, + "step": 12170 + }, + { + "ce_loss": 0.107203908264637, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "distill_loss": 0.13781806826591492, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "ref_ce_loss": 0.07520002871751785, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "loss": 0.3221982717514038, + "step": 12170 + }, + { + "ce_loss": 0.11160756647586823, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "distill_loss": 0.12180545181035995, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "ref_ce_loss": 0.06871822476387024, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "loss": 0.8085801601409912, + "step": 12170 + }, + { + "ce_loss": 0.1456156075000763, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "distill_loss": 0.19259947538375854, + "epoch": 4.059372915276851, + "step": 12170 + }, + { + "epoch": 4.059372915276851, + "ref_ce_loss": 0.0803808867931366, + "step": 12170 + }, + { + "epoch": 4.062708472314877, + "loss": 0.4678, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "grad_norm": 3.4009456634521484, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "learning_rate": 0.0001534837432382953, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "loss": 0.4208252727985382, + "step": 12180 + }, + { + "ce_loss": 0.0713610127568245, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "distill_loss": 0.1717347800731659, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "ref_ce_loss": 0.06079830601811409, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "loss": 0.5087335109710693, + "step": 12180 + }, + { + "ce_loss": 0.14097148180007935, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "distill_loss": 0.20097658038139343, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "ref_ce_loss": 0.08624764531850815, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "loss": 0.3848879337310791, + "step": 12180 + }, + { + "ce_loss": 0.10179894417524338, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "distill_loss": 0.19365674257278442, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "ref_ce_loss": 0.08927271515130997, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "loss": 0.3143869936466217, + "step": 12180 + }, + { + "ce_loss": 0.05073242262005806, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "distill_loss": 0.1419139802455902, + "epoch": 4.062708472314877, + "step": 12180 + }, + { + "epoch": 4.062708472314877, + "ref_ce_loss": 0.06367547810077667, + "step": 12180 + }, + { + "epoch": 4.066044029352902, + "loss": 0.485, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "grad_norm": 3.052934169769287, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "learning_rate": 0.00015328123333685855, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "loss": 0.43036988377571106, + "step": 12190 + }, + { + "ce_loss": 0.11047770082950592, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "distill_loss": 0.16661612689495087, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "ref_ce_loss": 0.07643501460552216, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "loss": 0.28986313939094543, + "step": 12190 + }, + { + "ce_loss": 0.09734617173671722, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "distill_loss": 0.10917098075151443, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "ref_ce_loss": 0.060155268758535385, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "loss": 0.35381343960762024, + "step": 12190 + }, + { + "ce_loss": 0.14211811125278473, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "distill_loss": 0.11142193526029587, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "ref_ce_loss": 0.06509396433830261, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "loss": 0.5622801780700684, + "step": 12190 + }, + { + "ce_loss": 0.12198896706104279, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "distill_loss": 0.19638624787330627, + "epoch": 4.066044029352902, + "step": 12190 + }, + { + "epoch": 4.066044029352902, + "ref_ce_loss": 0.09142731130123138, + "step": 12190 + }, + { + "epoch": 4.0693795863909275, + "loss": 0.4595, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "grad_norm": 3.138794183731079, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "learning_rate": 0.00015307871745174655, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "loss": 0.23779502511024475, + "step": 12200 + }, + { + "ce_loss": 0.034942738711833954, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "distill_loss": 0.09184229373931885, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "ref_ce_loss": 0.07803148776292801, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "loss": 0.9437388181686401, + "step": 12200 + }, + { + "ce_loss": 0.16672064363956451, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "distill_loss": 0.2688308656215668, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "ref_ce_loss": 0.0933331549167633, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "loss": 0.6048306822776794, + "step": 12200 + }, + { + "ce_loss": 0.10321357846260071, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "distill_loss": 0.24434088170528412, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "ref_ce_loss": 0.09095852822065353, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "loss": 0.5055559277534485, + "step": 12200 + }, + { + "ce_loss": 0.134865403175354, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "distill_loss": 0.16672514379024506, + "epoch": 4.0693795863909275, + "step": 12200 + }, + { + "epoch": 4.0693795863909275, + "ref_ce_loss": 0.08413047343492508, + "step": 12200 + }, + { + "epoch": 4.072715143428953, + "loss": 0.4824, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "grad_norm": 5.428567409515381, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "learning_rate": 0.00015287619595226839, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "loss": 0.5914174914360046, + "step": 12210 + }, + { + "ce_loss": 0.10178596526384354, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "distill_loss": 0.23800191283226013, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "ref_ce_loss": 0.10165928304195404, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "loss": 0.32598963379859924, + "step": 12210 + }, + { + "ce_loss": 0.07633938640356064, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "distill_loss": 0.1258414089679718, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "ref_ce_loss": 0.05301518365740776, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "loss": 0.36426302790641785, + "step": 12210 + }, + { + "ce_loss": 0.11082565039396286, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "distill_loss": 0.12851569056510925, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "ref_ce_loss": 0.10189623385667801, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "loss": 0.6586376428604126, + "step": 12210 + }, + { + "ce_loss": 0.3017585575580597, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "distill_loss": 0.2281065434217453, + "epoch": 4.072715143428953, + "step": 12210 + }, + { + "epoch": 4.072715143428953, + "ref_ce_loss": 0.12855705618858337, + "step": 12210 + }, + { + "epoch": 4.076050700466978, + "loss": 0.441, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "grad_norm": 2.7300357818603516, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "learning_rate": 0.00015267366920774337, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "loss": 0.5121831297874451, + "step": 12220 + }, + { + "ce_loss": 0.14911715686321259, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "distill_loss": 0.15121105313301086, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "ref_ce_loss": 0.08566410839557648, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "loss": 0.6446946263313293, + "step": 12220 + }, + { + "ce_loss": 0.08306907117366791, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "distill_loss": 0.17455963790416718, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "ref_ce_loss": 0.10251959413290024, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "loss": 0.3556770086288452, + "step": 12220 + }, + { + "ce_loss": 0.10885165631771088, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "distill_loss": 0.15706342458724976, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "ref_ce_loss": 0.08962170779705048, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "loss": 0.5599108338356018, + "step": 12220 + }, + { + "ce_loss": 0.15744344890117645, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "distill_loss": 0.21136343479156494, + "epoch": 4.076050700466978, + "step": 12220 + }, + { + "epoch": 4.076050700466978, + "ref_ce_loss": 0.13838784396648407, + "step": 12220 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.4326, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "grad_norm": 2.221538782119751, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "learning_rate": 0.0001524711375875004, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.5925437211990356, + "step": 12230 + }, + { + "ce_loss": 0.16379602253437042, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "distill_loss": 0.1683337390422821, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "ref_ce_loss": 0.1362374871969223, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.3094097077846527, + "step": 12230 + }, + { + "ce_loss": 0.07063864916563034, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "distill_loss": 0.09304803609848022, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "ref_ce_loss": 0.09349749237298965, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.35739865899086, + "step": 12230 + }, + { + "ce_loss": 0.07744188606739044, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "distill_loss": 0.15637515485286713, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "ref_ce_loss": 0.08380445837974548, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "loss": 0.30349835753440857, + "step": 12230 + }, + { + "ce_loss": 0.06610994786024094, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "distill_loss": 0.13098305463790894, + "epoch": 4.0793862575050035, + "step": 12230 + }, + { + "epoch": 4.0793862575050035, + "ref_ce_loss": 0.07835172116756439, + "step": 12230 + }, + { + "epoch": 4.082721814543029, + "loss": 0.4087, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "grad_norm": 3.177318572998047, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "learning_rate": 0.00015226860146087725, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "loss": 0.4792167842388153, + "step": 12240 + }, + { + "ce_loss": 0.12271936982870102, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "distill_loss": 0.26747363805770874, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "ref_ce_loss": 0.08874979615211487, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "loss": 0.3786783218383789, + "step": 12240 + }, + { + "ce_loss": 0.10062919557094574, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "distill_loss": 0.11116902530193329, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "ref_ce_loss": 0.13712024688720703, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "loss": 0.6380170583724976, + "step": 12240 + }, + { + "ce_loss": 0.08112715184688568, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "distill_loss": 0.0964871421456337, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "ref_ce_loss": 0.041808392852544785, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "loss": 0.2703827917575836, + "step": 12240 + }, + { + "ce_loss": 0.029660837724804878, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "distill_loss": 0.0692979022860527, + "epoch": 4.082721814543029, + "step": 12240 + }, + { + "epoch": 4.082721814543029, + "ref_ce_loss": 0.07503220438957214, + "step": 12240 + }, + { + "epoch": 4.086057371581054, + "loss": 0.4596, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "grad_norm": 3.3609211444854736, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "learning_rate": 0.00015206606119721986, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "loss": 0.29551267623901367, + "step": 12250 + }, + { + "ce_loss": 0.04871445521712303, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "distill_loss": 0.15821117162704468, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "ref_ce_loss": 0.08842559158802032, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "loss": 0.38177791237831116, + "step": 12250 + }, + { + "ce_loss": 0.07803545147180557, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "distill_loss": 0.14289258420467377, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "ref_ce_loss": 0.10587479919195175, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "loss": 0.28975555300712585, + "step": 12250 + }, + { + "ce_loss": 0.054523248225450516, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "distill_loss": 0.14741668105125427, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "ref_ce_loss": 0.08770716190338135, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "loss": 0.4934762120246887, + "step": 12250 + }, + { + "ce_loss": 0.1420019418001175, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "distill_loss": 0.2047957330942154, + "epoch": 4.086057371581054, + "step": 12250 + }, + { + "epoch": 4.086057371581054, + "ref_ce_loss": 0.10681845992803574, + "step": 12250 + }, + { + "epoch": 4.0893929286190795, + "loss": 0.4271, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "grad_norm": 3.676234245300293, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "learning_rate": 0.00015186351716588192, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "loss": 0.5635673403739929, + "step": 12260 + }, + { + "ce_loss": 0.10926533490419388, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "distill_loss": 0.18934011459350586, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "ref_ce_loss": 0.06814757734537125, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "loss": 0.6337475776672363, + "step": 12260 + }, + { + "ce_loss": 0.08871379494667053, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "distill_loss": 0.15814882516860962, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "ref_ce_loss": 0.1226770430803299, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "loss": 0.29514455795288086, + "step": 12260 + }, + { + "ce_loss": 0.06737024337053299, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "distill_loss": 0.1109817624092102, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "ref_ce_loss": 0.0816814973950386, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "loss": 0.3731231093406677, + "step": 12260 + }, + { + "ce_loss": 0.1104077622294426, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "distill_loss": 0.15331417322158813, + "epoch": 4.0893929286190795, + "step": 12260 + }, + { + "epoch": 4.0893929286190795, + "ref_ce_loss": 0.0772353783249855, + "step": 12260 + }, + { + "epoch": 4.092728485657105, + "loss": 0.4239, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "grad_norm": 2.3397421836853027, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "learning_rate": 0.00015166096973622377, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "loss": 0.35443541407585144, + "step": 12270 + }, + { + "ce_loss": 0.045290570706129074, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "distill_loss": 0.10662087053060532, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "ref_ce_loss": 0.06537654995918274, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "loss": 0.3716265857219696, + "step": 12270 + }, + { + "ce_loss": 0.05092225968837738, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "distill_loss": 0.11948276311159134, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "ref_ce_loss": 0.09160125255584717, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "loss": 0.22582557797431946, + "step": 12270 + }, + { + "ce_loss": 0.05884668603539467, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "distill_loss": 0.11232615262269974, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "ref_ce_loss": 0.05450451374053955, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "loss": 0.3369407653808594, + "step": 12270 + }, + { + "ce_loss": 0.07732923328876495, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "distill_loss": 0.11193235218524933, + "epoch": 4.092728485657105, + "step": 12270 + }, + { + "epoch": 4.092728485657105, + "ref_ce_loss": 0.0715961679816246, + "step": 12270 + }, + { + "epoch": 4.09606404269513, + "loss": 0.4267, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "grad_norm": 3.901005506515503, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "learning_rate": 0.00015145841927761196, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "loss": 0.33873653411865234, + "step": 12280 + }, + { + "ce_loss": 0.12671172618865967, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "distill_loss": 0.12484362721443176, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "ref_ce_loss": 0.06715315580368042, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "loss": 0.33130505681037903, + "step": 12280 + }, + { + "ce_loss": 0.061360765248537064, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "distill_loss": 0.11928476393222809, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "ref_ce_loss": 0.08986619114875793, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "loss": 0.2806569039821625, + "step": 12280 + }, + { + "ce_loss": 0.0503183975815773, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "distill_loss": 0.148472860455513, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "ref_ce_loss": 0.08172249048948288, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "loss": 0.39750438928604126, + "step": 12280 + }, + { + "ce_loss": 0.12096478044986725, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "distill_loss": 0.13770119845867157, + "epoch": 4.09606404269513, + "step": 12280 + }, + { + "epoch": 4.09606404269513, + "ref_ce_loss": 0.08861135691404343, + "step": 12280 + }, + { + "epoch": 4.099399599733156, + "loss": 0.4581, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "grad_norm": 3.1234652996063232, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "learning_rate": 0.00015125586615941873, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "loss": 0.37716343998908997, + "step": 12290 + }, + { + "ce_loss": 0.10681670159101486, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "distill_loss": 0.1452990025281906, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "ref_ce_loss": 0.10262039303779602, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "loss": 0.5069200992584229, + "step": 12290 + }, + { + "ce_loss": 0.11011409014463425, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "distill_loss": 0.15488030016422272, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "ref_ce_loss": 0.08841795474290848, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "loss": 0.3167344927787781, + "step": 12290 + }, + { + "ce_loss": 0.05991573631763458, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "distill_loss": 0.136915922164917, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "ref_ce_loss": 0.07452834397554398, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "loss": 0.3132794499397278, + "step": 12290 + }, + { + "ce_loss": 0.08132331073284149, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "distill_loss": 0.09339018166065216, + "epoch": 4.099399599733156, + "step": 12290 + }, + { + "epoch": 4.099399599733156, + "ref_ce_loss": 0.09748751670122147, + "step": 12290 + }, + { + "epoch": 4.102735156771181, + "loss": 0.4027, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "grad_norm": 2.9308016300201416, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "learning_rate": 0.00015105331075102103, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "loss": 0.375118613243103, + "step": 12300 + }, + { + "ce_loss": 0.10460587590932846, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "distill_loss": 0.14734624326229095, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "ref_ce_loss": 0.12303036451339722, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "loss": 0.2725878655910492, + "step": 12300 + }, + { + "ce_loss": 0.08057550340890884, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "distill_loss": 0.1144259050488472, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "ref_ce_loss": 0.07711077481508255, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "loss": 0.2181396335363388, + "step": 12300 + }, + { + "ce_loss": 0.05747740715742111, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "distill_loss": 0.09495794028043747, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "ref_ce_loss": 0.06535113602876663, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "loss": 0.30614638328552246, + "step": 12300 + }, + { + "ce_loss": 0.13755322992801666, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "distill_loss": 0.09745433181524277, + "epoch": 4.102735156771181, + "step": 12300 + }, + { + "epoch": 4.102735156771181, + "ref_ce_loss": 0.0710759088397026, + "step": 12300 + }, + { + "epoch": 4.106070713809206, + "loss": 0.379, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "grad_norm": 9.995908737182617, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "learning_rate": 0.0001508507534218, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "loss": 0.28247153759002686, + "step": 12310 + }, + { + "ce_loss": 0.06919530034065247, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "distill_loss": 0.1077154353260994, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "ref_ce_loss": 0.06293756514787674, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "loss": 0.3820110559463501, + "step": 12310 + }, + { + "ce_loss": 0.08053288608789444, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "distill_loss": 0.10802694410085678, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "ref_ce_loss": 0.0862024575471878, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "loss": 0.6087753772735596, + "step": 12310 + }, + { + "ce_loss": 0.117802694439888, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "distill_loss": 0.12614843249320984, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "ref_ce_loss": 0.08279848843812943, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "loss": 0.4322092533111572, + "step": 12310 + }, + { + "ce_loss": 0.13464225828647614, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "distill_loss": 0.11292240023612976, + "epoch": 4.106070713809206, + "step": 12310 + }, + { + "epoch": 4.106070713809206, + "ref_ce_loss": 0.08117159456014633, + "step": 12310 + }, + { + "epoch": 4.109406270847232, + "loss": 0.4053, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "grad_norm": 3.473320245742798, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "learning_rate": 0.00015064819454114033, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "loss": 0.373969167470932, + "step": 12320 + }, + { + "ce_loss": 0.1063445508480072, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "distill_loss": 0.15159161388874054, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "ref_ce_loss": 0.05470012128353119, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "loss": 0.4092685580253601, + "step": 12320 + }, + { + "ce_loss": 0.16364896297454834, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "distill_loss": 0.1414794623851776, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "ref_ce_loss": 0.10386661440134048, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "loss": 0.2099723517894745, + "step": 12320 + }, + { + "ce_loss": 0.024279853329062462, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "distill_loss": 0.07701626420021057, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "ref_ce_loss": 0.05946137383580208, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "loss": 0.6005130410194397, + "step": 12320 + }, + { + "ce_loss": 0.13691778481006622, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "distill_loss": 0.10761863738298416, + "epoch": 4.109406270847232, + "step": 12320 + }, + { + "epoch": 4.109406270847232, + "ref_ce_loss": 0.17544463276863098, + "step": 12320 + }, + { + "epoch": 4.112741827885257, + "loss": 0.4127, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "grad_norm": 1.8041068315505981, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "learning_rate": 0.0001504456344784295, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "loss": 0.44366419315338135, + "step": 12330 + }, + { + "ce_loss": 0.16683217883110046, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "distill_loss": 0.13771888613700867, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "ref_ce_loss": 0.09939558058977127, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "loss": 0.6878728270530701, + "step": 12330 + }, + { + "ce_loss": 0.0950092151761055, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "distill_loss": 0.1335957795381546, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "ref_ce_loss": 0.09792289137840271, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "loss": 0.28441381454467773, + "step": 12330 + }, + { + "ce_loss": 0.1099325641989708, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "distill_loss": 0.10999204218387604, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "ref_ce_loss": 0.06433827430009842, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "loss": 0.29116740822792053, + "step": 12330 + }, + { + "ce_loss": 0.0614168606698513, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "distill_loss": 0.11611483991146088, + "epoch": 4.112741827885257, + "step": 12330 + }, + { + "epoch": 4.112741827885257, + "ref_ce_loss": 0.05571691691875458, + "step": 12330 + }, + { + "epoch": 4.116077384923282, + "loss": 0.4103, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "grad_norm": 3.1782829761505127, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "learning_rate": 0.00015024307360305715, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "loss": 0.4817643165588379, + "step": 12340 + }, + { + "ce_loss": 0.1433020681142807, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "distill_loss": 0.15565498173236847, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "ref_ce_loss": 0.08871158957481384, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "loss": 0.4128016233444214, + "step": 12340 + }, + { + "ce_loss": 0.17788727581501007, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "distill_loss": 0.13444003462791443, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "ref_ce_loss": 0.10033713281154633, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "loss": 0.31887951493263245, + "step": 12340 + }, + { + "ce_loss": 0.03353816643357277, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "distill_loss": 0.13450832664966583, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "ref_ce_loss": 0.10326790064573288, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "loss": 0.248275026679039, + "step": 12340 + }, + { + "ce_loss": 0.07996595650911331, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "distill_loss": 0.09831559658050537, + "epoch": 4.116077384923282, + "step": 12340 + }, + { + "epoch": 4.116077384923282, + "ref_ce_loss": 0.06948510557413101, + "step": 12340 + }, + { + "epoch": 4.119412941961308, + "loss": 0.4763, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "grad_norm": 1.4909628629684448, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "learning_rate": 0.0001500405122844145, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "loss": 0.22745607793331146, + "step": 12350 + }, + { + "ce_loss": 0.062056880444288254, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "distill_loss": 0.09504853934049606, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "ref_ce_loss": 0.049994029104709625, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "loss": 0.44903331995010376, + "step": 12350 + }, + { + "ce_loss": 0.04053228721022606, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "distill_loss": 0.12767958641052246, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "ref_ce_loss": 0.06790989637374878, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "loss": 0.4405551552772522, + "step": 12350 + }, + { + "ce_loss": 0.17078498005867004, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "distill_loss": 0.16165587306022644, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "ref_ce_loss": 0.10795851796865463, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "loss": 0.23271147906780243, + "step": 12350 + }, + { + "ce_loss": 0.07316546142101288, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "distill_loss": 0.11298181116580963, + "epoch": 4.119412941961308, + "step": 12350 + }, + { + "epoch": 4.119412941961308, + "ref_ce_loss": 0.04643390700221062, + "step": 12350 + }, + { + "epoch": 4.122748498999333, + "loss": 0.4043, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "grad_norm": 2.5978572368621826, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "learning_rate": 0.00014983795089189335, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "loss": 0.33180752396583557, + "step": 12360 + }, + { + "ce_loss": 0.06547296792268753, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "distill_loss": 0.12039823830127716, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "ref_ce_loss": 0.10022406280040741, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "loss": 0.5192918181419373, + "step": 12360 + }, + { + "ce_loss": 0.12186074256896973, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "distill_loss": 0.18573838472366333, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "ref_ce_loss": 0.11628958582878113, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "loss": 0.38019275665283203, + "step": 12360 + }, + { + "ce_loss": 0.1042938232421875, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "distill_loss": 0.13982954621315002, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "ref_ce_loss": 0.09626378118991852, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "loss": 0.49562186002731323, + "step": 12360 + }, + { + "ce_loss": 0.1083085834980011, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "distill_loss": 0.09659755975008011, + "epoch": 4.122748498999333, + "step": 12360 + }, + { + "epoch": 4.122748498999333, + "ref_ce_loss": 0.09821917116641998, + "step": 12360 + }, + { + "epoch": 4.126084056037358, + "loss": 0.464, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "grad_norm": 2.7247259616851807, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "learning_rate": 0.0001496353897948859, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "loss": 0.29314735531806946, + "step": 12370 + }, + { + "ce_loss": 0.07004120200872421, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "distill_loss": 0.1270415484905243, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "ref_ce_loss": 0.06225429102778435, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "loss": 0.23097294569015503, + "step": 12370 + }, + { + "ce_loss": 0.04366536810994148, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "distill_loss": 0.1317523717880249, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "ref_ce_loss": 0.055458322167396545, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "loss": 0.5454058647155762, + "step": 12370 + }, + { + "ce_loss": 0.18543444573879242, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "distill_loss": 0.21535417437553406, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "ref_ce_loss": 0.09900393337011337, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "loss": 0.40011027455329895, + "step": 12370 + }, + { + "ce_loss": 0.06823138147592545, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "distill_loss": 0.1889926940202713, + "epoch": 4.126084056037358, + "step": 12370 + }, + { + "epoch": 4.126084056037358, + "ref_ce_loss": 0.10016527026891708, + "step": 12370 + }, + { + "epoch": 4.129419613075384, + "loss": 0.4531, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "grad_norm": 4.133249759674072, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "learning_rate": 0.00014943282936278365, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "loss": 0.3787361979484558, + "step": 12380 + }, + { + "ce_loss": 0.14036217331886292, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "distill_loss": 0.11300624161958694, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "ref_ce_loss": 0.09571686387062073, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "loss": 0.41098782420158386, + "step": 12380 + }, + { + "ce_loss": 0.0697634220123291, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "distill_loss": 0.21002747118473053, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "ref_ce_loss": 0.054851651191711426, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "loss": 0.3206042945384979, + "step": 12380 + }, + { + "ce_loss": 0.09390709549188614, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "distill_loss": 0.14140871167182922, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "ref_ce_loss": 0.08524361997842789, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "loss": 0.3636821508407593, + "step": 12380 + }, + { + "ce_loss": 0.09315887093544006, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "distill_loss": 0.1522243171930313, + "epoch": 4.129419613075384, + "step": 12380 + }, + { + "epoch": 4.129419613075384, + "ref_ce_loss": 0.0804595798254013, + "step": 12380 + }, + { + "epoch": 4.132755170113409, + "loss": 0.473, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "grad_norm": 3.5067243576049805, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "learning_rate": 0.00014923026996497684, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "loss": 0.5671949982643127, + "step": 12390 + }, + { + "ce_loss": 0.1705818623304367, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "distill_loss": 0.25043511390686035, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "ref_ce_loss": 0.10165385156869888, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "loss": 0.5505607724189758, + "step": 12390 + }, + { + "ce_loss": 0.15338660776615143, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "distill_loss": 0.25098103284835815, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "ref_ce_loss": 0.07809601724147797, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "loss": 0.2750885784626007, + "step": 12390 + }, + { + "ce_loss": 0.04651748389005661, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "distill_loss": 0.13873888552188873, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "ref_ce_loss": 0.04965699091553688, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "loss": 0.46758201718330383, + "step": 12390 + }, + { + "ce_loss": 0.09850476682186127, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "distill_loss": 0.16520605981349945, + "epoch": 4.132755170113409, + "step": 12390 + }, + { + "epoch": 4.132755170113409, + "ref_ce_loss": 0.13813252747058868, + "step": 12390 + }, + { + "epoch": 4.136090727151434, + "loss": 0.43, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "grad_norm": 2.588744640350342, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "learning_rate": 0.00014902771197085403, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "loss": 0.43794015049934387, + "step": 12400 + }, + { + "ce_loss": 0.0985134094953537, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "distill_loss": 0.1885952353477478, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "ref_ce_loss": 0.1172669306397438, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "loss": 0.3357079029083252, + "step": 12400 + }, + { + "ce_loss": 0.08318466693162918, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "distill_loss": 0.12697243690490723, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "ref_ce_loss": 0.0950666069984436, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "loss": 0.2388584166765213, + "step": 12400 + }, + { + "ce_loss": 0.04853922128677368, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "distill_loss": 0.10496456176042557, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "ref_ce_loss": 0.06415881961584091, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "loss": 0.31081292033195496, + "step": 12400 + }, + { + "ce_loss": 0.10069255530834198, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "distill_loss": 0.1103476732969284, + "epoch": 4.136090727151434, + "step": 12400 + }, + { + "epoch": 4.136090727151434, + "ref_ce_loss": 0.07207563519477844, + "step": 12400 + }, + { + "epoch": 4.13942628418946, + "loss": 0.4173, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "grad_norm": 2.5426387786865234, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "learning_rate": 0.00014882515574980108, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "loss": 0.3717375099658966, + "step": 12410 + }, + { + "ce_loss": 0.06511876732110977, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "distill_loss": 0.1163686066865921, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "ref_ce_loss": 0.06961381435394287, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "loss": 0.25051334500312805, + "step": 12410 + }, + { + "ce_loss": 0.05856660380959511, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "distill_loss": 0.08555157482624054, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "ref_ce_loss": 0.06662749499082565, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "loss": 0.3159472644329071, + "step": 12410 + }, + { + "ce_loss": 0.026417195796966553, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "distill_loss": 0.09527157992124557, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "ref_ce_loss": 0.09266111254692078, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "loss": 0.5626707673072815, + "step": 12410 + }, + { + "ce_loss": 0.14858536422252655, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "distill_loss": 0.16457600891590118, + "epoch": 4.13942628418946, + "step": 12410 + }, + { + "epoch": 4.13942628418946, + "ref_ce_loss": 0.09589312970638275, + "step": 12410 + }, + { + "epoch": 4.142761841227485, + "loss": 0.4934, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "grad_norm": 3.4323041439056396, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "learning_rate": 0.00014862260167120052, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "loss": 0.3895532488822937, + "step": 12420 + }, + { + "ce_loss": 0.11411762237548828, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "distill_loss": 0.18560034036636353, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "ref_ce_loss": 0.08894186466932297, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "loss": 0.5213585495948792, + "step": 12420 + }, + { + "ce_loss": 0.06978300213813782, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "distill_loss": 0.2566857933998108, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "ref_ce_loss": 0.10135982930660248, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "loss": 0.5565406084060669, + "step": 12420 + }, + { + "ce_loss": 0.058990854769945145, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "distill_loss": 0.19492602348327637, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "ref_ce_loss": 0.07594392448663712, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "loss": 0.2676648497581482, + "step": 12420 + }, + { + "ce_loss": 0.07127406448125839, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "distill_loss": 0.1313502937555313, + "epoch": 4.142761841227485, + "step": 12420 + }, + { + "epoch": 4.142761841227485, + "ref_ce_loss": 0.06474132835865021, + "step": 12420 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.4454, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "grad_norm": 2.4196767807006836, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "learning_rate": 0.00014842005010443126, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.3624645471572876, + "step": 12430 + }, + { + "ce_loss": 0.09504459798336029, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "distill_loss": 0.09769954532384872, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "ref_ce_loss": 0.09856883436441422, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.2978444993495941, + "step": 12430 + }, + { + "ce_loss": 0.07314880937337875, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "distill_loss": 0.11760088801383972, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "ref_ce_loss": 0.1069493219256401, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.27242863178253174, + "step": 12430 + }, + { + "ce_loss": 0.06576551496982574, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "distill_loss": 0.09319284558296204, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "ref_ce_loss": 0.07609212398529053, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "loss": 0.35750612616539, + "step": 12430 + }, + { + "ce_loss": 0.06778037548065186, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "distill_loss": 0.10166697949171066, + "epoch": 4.1460973982655105, + "step": 12430 + }, + { + "epoch": 4.1460973982655105, + "ref_ce_loss": 0.07472172379493713, + "step": 12430 + }, + { + "epoch": 4.149432955303536, + "loss": 0.3868, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "grad_norm": 2.221003532409668, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "learning_rate": 0.0001482175014188673, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "loss": 0.2614574730396271, + "step": 12440 + }, + { + "ce_loss": 0.031120847910642624, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "distill_loss": 0.08347459137439728, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "ref_ce_loss": 0.06839940696954727, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "loss": 0.27395689487457275, + "step": 12440 + }, + { + "ce_loss": 0.061532970517873764, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "distill_loss": 0.12716756761074066, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "ref_ce_loss": 0.08494030684232712, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "loss": 0.20453256368637085, + "step": 12440 + }, + { + "ce_loss": 0.03099118359386921, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "distill_loss": 0.0905686765909195, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "ref_ce_loss": 0.04775973781943321, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "loss": 0.2593122124671936, + "step": 12440 + }, + { + "ce_loss": 0.027723398059606552, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "distill_loss": 0.12843185663223267, + "epoch": 4.149432955303536, + "step": 12440 + }, + { + "epoch": 4.149432955303536, + "ref_ce_loss": 0.10249225795269012, + "step": 12440 + }, + { + "epoch": 4.152768512341561, + "loss": 0.4185, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "grad_norm": 3.3163435459136963, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "learning_rate": 0.00014801495598387764, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "loss": 0.3037762939929962, + "step": 12450 + }, + { + "ce_loss": 0.08418890833854675, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "distill_loss": 0.13176003098487854, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "ref_ce_loss": 0.08765073120594025, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "loss": 0.3808235228061676, + "step": 12450 + }, + { + "ce_loss": 0.08875785768032074, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "distill_loss": 0.1582205891609192, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "ref_ce_loss": 0.08822675794363022, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "loss": 0.803396999835968, + "step": 12450 + }, + { + "ce_loss": 0.12241890281438828, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "distill_loss": 0.11734962463378906, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "ref_ce_loss": 0.0981023758649826, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "loss": 0.23134736716747284, + "step": 12450 + }, + { + "ce_loss": 0.04157764092087746, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "distill_loss": 0.10362343490123749, + "epoch": 4.152768512341561, + "step": 12450 + }, + { + "epoch": 4.152768512341561, + "ref_ce_loss": 0.08595871925354004, + "step": 12450 + }, + { + "epoch": 4.1561040693795865, + "loss": 0.4531, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "grad_norm": 3.0537631511688232, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "learning_rate": 0.00014781241416882525, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "loss": 0.46858227252960205, + "step": 12460 + }, + { + "ce_loss": 0.17086638510227203, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "distill_loss": 0.1451958417892456, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "ref_ce_loss": 0.07415025681257248, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "loss": 0.4011555016040802, + "step": 12460 + }, + { + "ce_loss": 0.09203765541315079, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "distill_loss": 0.14013737440109253, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "ref_ce_loss": 0.07352907210588455, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "loss": 0.5737204551696777, + "step": 12460 + }, + { + "ce_loss": 0.10794582217931747, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "distill_loss": 0.11963769048452377, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "ref_ce_loss": 0.1095840260386467, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "loss": 0.43205690383911133, + "step": 12460 + }, + { + "ce_loss": 0.15435588359832764, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "distill_loss": 0.1649441421031952, + "epoch": 4.1561040693795865, + "step": 12460 + }, + { + "epoch": 4.1561040693795865, + "ref_ce_loss": 0.07601425796747208, + "step": 12460 + }, + { + "epoch": 4.159439626417612, + "loss": 0.4305, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "grad_norm": 3.452244520187378, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "learning_rate": 0.00014760987634306646, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "loss": 0.65595543384552, + "step": 12470 + }, + { + "ce_loss": 0.1943955272436142, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "distill_loss": 0.13412387669086456, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "ref_ce_loss": 0.09166346490383148, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "loss": 0.47722163796424866, + "step": 12470 + }, + { + "ce_loss": 0.15214447677135468, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "distill_loss": 0.21324175596237183, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "ref_ce_loss": 0.09311570227146149, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "loss": 0.6223658919334412, + "step": 12470 + }, + { + "ce_loss": 0.1359712928533554, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "distill_loss": 0.18622928857803345, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "ref_ce_loss": 0.14117476344108582, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "loss": 0.45449307560920715, + "step": 12470 + }, + { + "ce_loss": 0.1701548993587494, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "distill_loss": 0.1619931310415268, + "epoch": 4.159439626417612, + "step": 12470 + }, + { + "epoch": 4.159439626417612, + "ref_ce_loss": 0.1218772828578949, + "step": 12470 + }, + { + "epoch": 4.162775183455637, + "loss": 0.4336, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "grad_norm": 2.113687753677368, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "learning_rate": 0.0001474073428759504, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "loss": 0.40800178050994873, + "step": 12480 + }, + { + "ce_loss": 0.05434617027640343, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "distill_loss": 0.13565826416015625, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "ref_ce_loss": 0.08128561824560165, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "loss": 0.32694897055625916, + "step": 12480 + }, + { + "ce_loss": 0.042434632778167725, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "distill_loss": 0.10554373264312744, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "ref_ce_loss": 0.08126919716596603, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "loss": 0.1619693487882614, + "step": 12480 + }, + { + "ce_loss": 0.011762240901589394, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "distill_loss": 0.06072559207677841, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "ref_ce_loss": 0.046694181859493256, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "loss": 0.46953803300857544, + "step": 12480 + }, + { + "ce_loss": 0.15216673910617828, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "distill_loss": 0.15188661217689514, + "epoch": 4.162775183455637, + "step": 12480 + }, + { + "epoch": 4.162775183455637, + "ref_ce_loss": 0.09307637065649033, + "step": 12480 + }, + { + "epoch": 4.166110740493663, + "loss": 0.3863, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "grad_norm": 1.9610040187835693, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "learning_rate": 0.0001472048141368182, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "loss": 0.4812275767326355, + "step": 12490 + }, + { + "ce_loss": 0.16773614287376404, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "distill_loss": 0.14370931684970856, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "ref_ce_loss": 0.12923017144203186, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "loss": 0.3812367618083954, + "step": 12490 + }, + { + "ce_loss": 0.10464449226856232, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "distill_loss": 0.11006525903940201, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "ref_ce_loss": 0.09356817603111267, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "loss": 0.32714563608169556, + "step": 12490 + }, + { + "ce_loss": 0.07036662846803665, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "distill_loss": 0.09512940049171448, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "ref_ce_loss": 0.06031448766589165, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "loss": 0.32152891159057617, + "step": 12490 + }, + { + "ce_loss": 0.11197475343942642, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "distill_loss": 0.10625480115413666, + "epoch": 4.166110740493663, + "step": 12490 + }, + { + "epoch": 4.166110740493663, + "ref_ce_loss": 0.07325571775436401, + "step": 12490 + }, + { + "epoch": 4.169446297531688, + "loss": 0.4025, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "grad_norm": 4.8676300048828125, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "learning_rate": 0.0001470022904950024, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "loss": 0.34299716353416443, + "step": 12500 + }, + { + "ce_loss": 0.12802059948444366, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "distill_loss": 0.09531907737255096, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "ref_ce_loss": 0.07253427058458328, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "loss": 0.5606489777565002, + "step": 12500 + }, + { + "ce_loss": 0.16046318411827087, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "distill_loss": 0.2585448622703552, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "ref_ce_loss": 0.14125660061836243, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "loss": 0.2809467017650604, + "step": 12500 + }, + { + "ce_loss": 0.0834980458021164, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "distill_loss": 0.13691724836826324, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "ref_ce_loss": 0.06044924259185791, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "loss": 0.29263341426849365, + "step": 12500 + }, + { + "ce_loss": 0.0704631358385086, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "distill_loss": 0.11643597483634949, + "epoch": 4.169446297531688, + "step": 12500 + }, + { + "epoch": 4.169446297531688, + "ref_ce_loss": 0.10554829239845276, + "step": 12500 + }, + { + "epoch": 4.172781854569713, + "loss": 0.4319, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "grad_norm": 4.185899257659912, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "learning_rate": 0.00014679977231982629, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "loss": 0.30824387073516846, + "step": 12510 + }, + { + "ce_loss": 0.08948945254087448, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "distill_loss": 0.12239973247051239, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "ref_ce_loss": 0.06972790509462357, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "loss": 0.2979793846607208, + "step": 12510 + }, + { + "ce_loss": 0.09574341028928757, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "distill_loss": 0.09483419358730316, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "ref_ce_loss": 0.07258374243974686, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "loss": 0.20826782286167145, + "step": 12510 + }, + { + "ce_loss": 0.05446157604455948, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "distill_loss": 0.08192051947116852, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "ref_ce_loss": 0.07135691493749619, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "loss": 0.43391022086143494, + "step": 12510 + }, + { + "ce_loss": 0.11679257452487946, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "distill_loss": 0.15242500603199005, + "epoch": 4.172781854569713, + "step": 12510 + }, + { + "epoch": 4.172781854569713, + "ref_ce_loss": 0.11640404164791107, + "step": 12510 + }, + { + "epoch": 4.176117411607739, + "loss": 0.3719, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "grad_norm": 1.7702113389968872, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "learning_rate": 0.000146597259980603, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "loss": 0.3008236885070801, + "step": 12520 + }, + { + "ce_loss": 0.03070242889225483, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "distill_loss": 0.1274239718914032, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "ref_ce_loss": 0.08409994840621948, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "loss": 0.3606027662754059, + "step": 12520 + }, + { + "ce_loss": 0.0853457897901535, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "distill_loss": 0.09666283428668976, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "ref_ce_loss": 0.11446385830640793, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "loss": 0.3016287386417389, + "step": 12520 + }, + { + "ce_loss": 0.08930810540914536, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "distill_loss": 0.10128715634346008, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "ref_ce_loss": 0.07626650482416153, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "loss": 0.26100292801856995, + "step": 12520 + }, + { + "ce_loss": 0.08020920306444168, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "distill_loss": 0.10896246135234833, + "epoch": 4.176117411607739, + "step": 12520 + }, + { + "epoch": 4.176117411607739, + "ref_ce_loss": 0.07162782549858093, + "step": 12520 + }, + { + "epoch": 4.179452968645764, + "loss": 0.349, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "grad_norm": 2.14904522895813, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "learning_rate": 0.00014639475384663528, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "loss": 0.2830139100551605, + "step": 12530 + }, + { + "ce_loss": 0.0933670923113823, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "distill_loss": 0.10775136947631836, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "ref_ce_loss": 0.059976205229759216, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "loss": 0.4265592396259308, + "step": 12530 + }, + { + "ce_loss": 0.08390668779611588, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "distill_loss": 0.18817268311977386, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "ref_ce_loss": 0.09286605566740036, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "loss": 0.25192058086395264, + "step": 12530 + }, + { + "ce_loss": 0.07058953493833542, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "distill_loss": 0.10347865521907806, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "ref_ce_loss": 0.07778134942054749, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "loss": 0.25412583351135254, + "step": 12530 + }, + { + "ce_loss": 0.062413640320301056, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "distill_loss": 0.08789204806089401, + "epoch": 4.179452968645764, + "step": 12530 + }, + { + "epoch": 4.179452968645764, + "ref_ce_loss": 0.08234740793704987, + "step": 12530 + }, + { + "epoch": 4.182788525683789, + "loss": 0.3877, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "grad_norm": 2.911775588989258, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "learning_rate": 0.0001461922542872144, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "loss": 0.69565749168396, + "step": 12540 + }, + { + "ce_loss": 0.1180061474442482, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "distill_loss": 0.1855594664812088, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "ref_ce_loss": 0.10253113508224487, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "loss": 0.24786855280399323, + "step": 12540 + }, + { + "ce_loss": 0.024127228185534477, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "distill_loss": 0.13936744630336761, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "ref_ce_loss": 0.06584622710943222, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "loss": 0.29231300950050354, + "step": 12540 + }, + { + "ce_loss": 0.10617182403802872, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "distill_loss": 0.11738552898168564, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "ref_ce_loss": 0.06845077127218246, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "loss": 0.248124897480011, + "step": 12540 + }, + { + "ce_loss": 0.03263659402728081, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "distill_loss": 0.09977734833955765, + "epoch": 4.182788525683789, + "step": 12540 + }, + { + "epoch": 4.182788525683789, + "ref_ce_loss": 0.07695674896240234, + "step": 12540 + }, + { + "epoch": 4.186124082721815, + "loss": 0.3926, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "grad_norm": 7.9306321144104, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "learning_rate": 0.00014598976167161964, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "loss": 0.6581223011016846, + "step": 12550 + }, + { + "ce_loss": 0.12450020760297775, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "distill_loss": 0.16247011721134186, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "ref_ce_loss": 0.14108185470104218, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "loss": 0.22678981721401215, + "step": 12550 + }, + { + "ce_loss": 0.05303997918963432, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "distill_loss": 0.11390000581741333, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "ref_ce_loss": 0.059662774205207825, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "loss": 0.3068074584007263, + "step": 12550 + }, + { + "ce_loss": 0.11068800836801529, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "distill_loss": 0.09961576014757156, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "ref_ce_loss": 0.07074546813964844, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "loss": 0.4554699957370758, + "step": 12550 + }, + { + "ce_loss": 0.13982751965522766, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "distill_loss": 0.14476770162582397, + "epoch": 4.186124082721815, + "step": 12550 + }, + { + "epoch": 4.186124082721815, + "ref_ce_loss": 0.12475011497735977, + "step": 12550 + }, + { + "epoch": 4.18945963975984, + "loss": 0.4378, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "grad_norm": 2.6521835327148438, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "learning_rate": 0.00014578727636911773, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "loss": 0.4609094262123108, + "step": 12560 + }, + { + "ce_loss": 0.14568175375461578, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "distill_loss": 0.16340190172195435, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "ref_ce_loss": 0.10280240327119827, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "loss": 0.22618383169174194, + "step": 12560 + }, + { + "ce_loss": 0.04781392216682434, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "distill_loss": 0.09708165377378464, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "ref_ce_loss": 0.05278674140572548, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "loss": 0.3315080404281616, + "step": 12560 + }, + { + "ce_loss": 0.06448005139827728, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "distill_loss": 0.12131848186254501, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "ref_ce_loss": 0.07438777387142181, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "loss": 0.6073876023292542, + "step": 12560 + }, + { + "ce_loss": 0.21663253009319305, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "distill_loss": 0.18993952870368958, + "epoch": 4.18945963975984, + "step": 12560 + }, + { + "epoch": 4.18945963975984, + "ref_ce_loss": 0.13462689518928528, + "step": 12560 + }, + { + "epoch": 4.192795196797865, + "loss": 0.4217, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "grad_norm": 3.1643049716949463, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "learning_rate": 0.0001455847987489619, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "loss": 0.37981656193733215, + "step": 12570 + }, + { + "ce_loss": 0.07878326624631882, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "distill_loss": 0.14073777198791504, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "ref_ce_loss": 0.12595392763614655, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "loss": 0.5180966854095459, + "step": 12570 + }, + { + "ce_loss": 0.12497790902853012, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "distill_loss": 0.11337775737047195, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "ref_ce_loss": 0.05922972410917282, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "loss": 0.23794496059417725, + "step": 12570 + }, + { + "ce_loss": 0.046828463673591614, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "distill_loss": 0.0907125473022461, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "ref_ce_loss": 0.06597310304641724, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "loss": 0.23115824162960052, + "step": 12570 + }, + { + "ce_loss": 0.04305850341916084, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "distill_loss": 0.09254015237092972, + "epoch": 4.192795196797865, + "step": 12570 + }, + { + "epoch": 4.192795196797865, + "ref_ce_loss": 0.09520301222801208, + "step": 12570 + }, + { + "epoch": 4.196130753835891, + "loss": 0.4201, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "grad_norm": 2.1938881874084473, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "learning_rate": 0.0001453823291803915, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "loss": 0.3288840055465698, + "step": 12580 + }, + { + "ce_loss": 0.11204256117343903, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "distill_loss": 0.10862526297569275, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "ref_ce_loss": 0.07976261526346207, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "loss": 0.4981669783592224, + "step": 12580 + }, + { + "ce_loss": 0.14517666399478912, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "distill_loss": 0.1696842610836029, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "ref_ce_loss": 0.10310443490743637, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "loss": 0.3159114718437195, + "step": 12580 + }, + { + "ce_loss": 0.07832693308591843, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "distill_loss": 0.09905324131250381, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "ref_ce_loss": 0.07301445305347443, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "loss": 0.3570947051048279, + "step": 12580 + }, + { + "ce_loss": 0.112045057117939, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "distill_loss": 0.12575042247772217, + "epoch": 4.196130753835891, + "step": 12580 + }, + { + "epoch": 4.196130753835891, + "ref_ce_loss": 0.08894684910774231, + "step": 12580 + }, + { + "epoch": 4.199466310873916, + "loss": 0.4287, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "grad_norm": 2.1162467002868652, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "learning_rate": 0.00014517986803263115, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "loss": 0.3382636606693268, + "step": 12590 + }, + { + "ce_loss": 0.10414419323205948, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "distill_loss": 0.14084231853485107, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "ref_ce_loss": 0.09309055656194687, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "loss": 0.41425347328186035, + "step": 12590 + }, + { + "ce_loss": 0.09124680608510971, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "distill_loss": 0.11330849677324295, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "ref_ce_loss": 0.09014124423265457, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "loss": 0.6550116539001465, + "step": 12590 + }, + { + "ce_loss": 0.19489094614982605, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "distill_loss": 0.2267588973045349, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "ref_ce_loss": 0.08232640475034714, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "loss": 0.5072152614593506, + "step": 12590 + }, + { + "ce_loss": 0.0739305317401886, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "distill_loss": 0.1428247094154358, + "epoch": 4.199466310873916, + "step": 12590 + }, + { + "epoch": 4.199466310873916, + "ref_ce_loss": 0.08314472436904907, + "step": 12590 + }, + { + "epoch": 4.202801867911941, + "loss": 0.4778, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "grad_norm": 5.220403671264648, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "learning_rate": 0.00014497741567489012, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "loss": 0.4604666829109192, + "step": 12600 + }, + { + "ce_loss": 0.1026303842663765, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "distill_loss": 0.12971565127372742, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "ref_ce_loss": 0.09788954257965088, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "loss": 0.3484407067298889, + "step": 12600 + }, + { + "ce_loss": 0.05491747707128525, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "distill_loss": 0.12664100527763367, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "ref_ce_loss": 0.08006755262613297, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "loss": 0.28191614151000977, + "step": 12600 + }, + { + "ce_loss": 0.029443496838212013, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "distill_loss": 0.10391257703304291, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "ref_ce_loss": 0.10279714316129684, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "loss": 0.4100857377052307, + "step": 12600 + }, + { + "ce_loss": 0.13263317942619324, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "distill_loss": 0.12876605987548828, + "epoch": 4.202801867911941, + "step": 12600 + }, + { + "epoch": 4.202801867911941, + "ref_ce_loss": 0.10536639392375946, + "step": 12600 + }, + { + "epoch": 4.206137424949967, + "loss": 0.3917, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "grad_norm": 2.95518159866333, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "learning_rate": 0.00014477497247636167, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "loss": 0.33207955956459045, + "step": 12610 + }, + { + "ce_loss": 0.05851322412490845, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "distill_loss": 0.12857557833194733, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "ref_ce_loss": 0.04979289323091507, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "loss": 0.3608654737472534, + "step": 12610 + }, + { + "ce_loss": 0.13061387836933136, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "distill_loss": 0.10170701146125793, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "ref_ce_loss": 0.064691923558712, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "loss": 0.5522586703300476, + "step": 12610 + }, + { + "ce_loss": 0.12592393159866333, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "distill_loss": 0.17105364799499512, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "ref_ce_loss": 0.12697115540504456, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "loss": 0.27305784821510315, + "step": 12610 + }, + { + "ce_loss": 0.06921637058258057, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "distill_loss": 0.12910543382167816, + "epoch": 4.206137424949967, + "step": 12610 + }, + { + "epoch": 4.206137424949967, + "ref_ce_loss": 0.05369148030877113, + "step": 12610 + }, + { + "epoch": 4.209472981987992, + "loss": 0.4288, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "grad_norm": 2.8391435146331787, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "learning_rate": 0.0001445725388062223, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "loss": 0.586120069026947, + "step": 12620 + }, + { + "ce_loss": 0.13455958664417267, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "distill_loss": 0.18541017174720764, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "ref_ce_loss": 0.09259842336177826, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "loss": 0.6589750051498413, + "step": 12620 + }, + { + "ce_loss": 0.19765350222587585, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "distill_loss": 0.15606580674648285, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "ref_ce_loss": 0.12804923951625824, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "loss": 0.26120057702064514, + "step": 12620 + }, + { + "ce_loss": 0.08098047226667404, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "distill_loss": 0.09746217727661133, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "ref_ce_loss": 0.05925130099058151, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "loss": 0.2098916471004486, + "step": 12620 + }, + { + "ce_loss": 0.03318024426698685, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "distill_loss": 0.12412787973880768, + "epoch": 4.209472981987992, + "step": 12620 + }, + { + "epoch": 4.209472981987992, + "ref_ce_loss": 0.05251970887184143, + "step": 12620 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.427, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "grad_norm": 2.3983237743377686, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "learning_rate": 0.00014437011503363117, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.4466587007045746, + "step": 12630 + }, + { + "ce_loss": 0.12958595156669617, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "distill_loss": 0.15985801815986633, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "ref_ce_loss": 0.09321147948503494, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.40337681770324707, + "step": 12630 + }, + { + "ce_loss": 0.1810450404882431, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "distill_loss": 0.1132136881351471, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "ref_ce_loss": 0.08407022804021835, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.6117267608642578, + "step": 12630 + }, + { + "ce_loss": 0.18386036157608032, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "distill_loss": 0.1988612860441208, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "ref_ce_loss": 0.12381020188331604, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "loss": 0.3889603018760681, + "step": 12630 + }, + { + "ce_loss": 0.11408506333827972, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "distill_loss": 0.17046092450618744, + "epoch": 4.2128085390260175, + "step": 12630 + }, + { + "epoch": 4.2128085390260175, + "ref_ce_loss": 0.10424398630857468, + "step": 12630 + }, + { + "epoch": 4.216144096064043, + "loss": 0.4039, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "grad_norm": 2.042799472808838, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "learning_rate": 0.0001441677015277295, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "loss": 0.5464071035385132, + "step": 12640 + }, + { + "ce_loss": 0.06709546595811844, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "distill_loss": 0.15532134473323822, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "ref_ce_loss": 0.0987556055188179, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "loss": 0.35175639390945435, + "step": 12640 + }, + { + "ce_loss": 0.09659342467784882, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "distill_loss": 0.12190660834312439, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "ref_ce_loss": 0.11512437462806702, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "loss": 0.37998417019844055, + "step": 12640 + }, + { + "ce_loss": 0.07071369886398315, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "distill_loss": 0.14697347581386566, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "ref_ce_loss": 0.1176176369190216, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "loss": 0.28571704030036926, + "step": 12640 + }, + { + "ce_loss": 0.047390177845954895, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "distill_loss": 0.10813229531049728, + "epoch": 4.216144096064043, + "step": 12640 + }, + { + "epoch": 4.216144096064043, + "ref_ce_loss": 0.08257561177015305, + "step": 12640 + }, + { + "epoch": 4.219479653102068, + "loss": 0.3949, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "grad_norm": 2.189134359359741, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "learning_rate": 0.00014396529865763947, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "loss": 0.2742716073989868, + "step": 12650 + }, + { + "ce_loss": 0.04334789887070656, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "distill_loss": 0.10433655232191086, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "ref_ce_loss": 0.07580353319644928, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "loss": 0.39648038148880005, + "step": 12650 + }, + { + "ce_loss": 0.08383668214082718, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "distill_loss": 0.12050480395555496, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "ref_ce_loss": 0.09287303686141968, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "loss": 0.46412718296051025, + "step": 12650 + }, + { + "ce_loss": 0.09492798149585724, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "distill_loss": 0.16380393505096436, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "ref_ce_loss": 0.08897554874420166, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "loss": 0.36522769927978516, + "step": 12650 + }, + { + "ce_loss": 0.07890743762254715, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "distill_loss": 0.1172899603843689, + "epoch": 4.219479653102068, + "step": 12650 + }, + { + "epoch": 4.219479653102068, + "ref_ce_loss": 0.07658599317073822, + "step": 12650 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.3507, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "grad_norm": 1.6064761877059937, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "learning_rate": 0.0001437629067924643, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.3317145109176636, + "step": 12660 + }, + { + "ce_loss": 0.07839033752679825, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "distill_loss": 0.10722742974758148, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "ref_ce_loss": 0.11385882645845413, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.32069358229637146, + "step": 12660 + }, + { + "ce_loss": 0.10389983654022217, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "distill_loss": 0.14690223336219788, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "ref_ce_loss": 0.06953489780426025, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.4394501745700836, + "step": 12660 + }, + { + "ce_loss": 0.17041996121406555, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "distill_loss": 0.11526565998792648, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "ref_ce_loss": 0.09521762281656265, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "loss": 0.4944007396697998, + "step": 12660 + }, + { + "ce_loss": 0.14096632599830627, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "distill_loss": 0.12511637806892395, + "epoch": 4.2228152101400935, + "step": 12660 + }, + { + "epoch": 4.2228152101400935, + "ref_ce_loss": 0.09930511564016342, + "step": 12660 + }, + { + "epoch": 4.226150767178119, + "loss": 0.4517, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "grad_norm": 3.4801747798919678, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "learning_rate": 0.00014356052630128675, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "loss": 0.4144769310951233, + "step": 12670 + }, + { + "ce_loss": 0.09750276803970337, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "distill_loss": 0.1400170475244522, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "ref_ce_loss": 0.0747247189283371, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "loss": 0.2999867796897888, + "step": 12670 + }, + { + "ce_loss": 0.05919264629483223, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "distill_loss": 0.12513509392738342, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "ref_ce_loss": 0.08204268664121628, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "loss": 0.4107738435268402, + "step": 12670 + }, + { + "ce_loss": 0.1336008906364441, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "distill_loss": 0.15651835501194, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "ref_ce_loss": 0.08766976743936539, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "loss": 0.43431609869003296, + "step": 12670 + }, + { + "ce_loss": 0.12974825501441956, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "distill_loss": 0.11243186146020889, + "epoch": 4.226150767178119, + "step": 12670 + }, + { + "epoch": 4.226150767178119, + "ref_ce_loss": 0.09981807321310043, + "step": 12670 + }, + { + "epoch": 4.229486324216144, + "loss": 0.3862, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "grad_norm": 1.9466087818145752, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "learning_rate": 0.00014335815755316903, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "loss": 0.46811777353286743, + "step": 12680 + }, + { + "ce_loss": 0.10368335247039795, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "distill_loss": 0.11334482580423355, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "ref_ce_loss": 0.09854672104120255, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "loss": 0.49574559926986694, + "step": 12680 + }, + { + "ce_loss": 0.0772734060883522, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "distill_loss": 0.13109725713729858, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "ref_ce_loss": 0.06632824242115021, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "loss": 0.2777520418167114, + "step": 12680 + }, + { + "ce_loss": 0.04080289229750633, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "distill_loss": 0.09627961367368698, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "ref_ce_loss": 0.05981730669736862, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "loss": 0.43288472294807434, + "step": 12680 + }, + { + "ce_loss": 0.1106693223118782, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "distill_loss": 0.1380157321691513, + "epoch": 4.229486324216144, + "step": 12680 + }, + { + "epoch": 4.229486324216144, + "ref_ce_loss": 0.12508343160152435, + "step": 12680 + }, + { + "epoch": 4.23282188125417, + "loss": 0.3997, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "grad_norm": 2.4736568927764893, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "learning_rate": 0.00014315580091715202, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "loss": 0.3552623391151428, + "step": 12690 + }, + { + "ce_loss": 0.08149318397045135, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "distill_loss": 0.1482114940881729, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "ref_ce_loss": 0.08624199777841568, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "loss": 0.3837560713291168, + "step": 12690 + }, + { + "ce_loss": 0.12796638906002045, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "distill_loss": 0.1531001329421997, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "ref_ce_loss": 0.07761276513338089, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "loss": 0.4810293912887573, + "step": 12690 + }, + { + "ce_loss": 0.16699600219726562, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "distill_loss": 0.12791535258293152, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "ref_ce_loss": 0.15135811269283295, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "loss": 0.5207319855690002, + "step": 12690 + }, + { + "ce_loss": 0.1772138774394989, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "distill_loss": 0.16876177489757538, + "epoch": 4.23282188125417, + "step": 12690 + }, + { + "epoch": 4.23282188125417, + "ref_ce_loss": 0.06250439584255219, + "step": 12690 + }, + { + "epoch": 4.236157438292195, + "loss": 0.4372, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "grad_norm": 3.4386374950408936, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "learning_rate": 0.00014295345676225427, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "loss": 0.6116504669189453, + "step": 12700 + }, + { + "ce_loss": 0.12445548921823502, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "distill_loss": 0.18399251997470856, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "ref_ce_loss": 0.10828568041324615, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "loss": 0.41157230734825134, + "step": 12700 + }, + { + "ce_loss": 0.058100927621126175, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "distill_loss": 0.12735587358474731, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "ref_ce_loss": 0.08694470673799515, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "loss": 0.29641038179397583, + "step": 12700 + }, + { + "ce_loss": 0.05376739799976349, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "distill_loss": 0.15917444229125977, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "ref_ce_loss": 0.05772160366177559, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "loss": 0.37515804171562195, + "step": 12700 + }, + { + "ce_loss": 0.08279716223478317, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "distill_loss": 0.15383553504943848, + "epoch": 4.236157438292195, + "step": 12700 + }, + { + "epoch": 4.236157438292195, + "ref_ce_loss": 0.11051376909017563, + "step": 12700 + }, + { + "epoch": 4.23949299533022, + "loss": 0.4558, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "grad_norm": 2.184640884399414, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "learning_rate": 0.0001427511254574717, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "loss": 0.5931957960128784, + "step": 12710 + }, + { + "ce_loss": 0.16643624007701874, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "distill_loss": 0.168610081076622, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "ref_ce_loss": 0.07215865701436996, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "loss": 0.4027775526046753, + "step": 12710 + }, + { + "ce_loss": 0.12669742107391357, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "distill_loss": 0.16142867505550385, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "ref_ce_loss": 0.09073889255523682, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "loss": 0.5018163323402405, + "step": 12710 + }, + { + "ce_loss": 0.15854166448116302, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "distill_loss": 0.18602553009986877, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "ref_ce_loss": 0.11441418528556824, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "loss": 0.3939116299152374, + "step": 12710 + }, + { + "ce_loss": 0.06274707615375519, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "distill_loss": 0.11968748271465302, + "epoch": 4.23949299533022, + "step": 12710 + }, + { + "epoch": 4.23949299533022, + "ref_ce_loss": 0.12243720889091492, + "step": 12710 + }, + { + "epoch": 4.242828552368246, + "loss": 0.4039, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "grad_norm": 1.5773797035217285, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "learning_rate": 0.00014254880737177696, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "loss": 0.750359296798706, + "step": 12720 + }, + { + "ce_loss": 0.11326040327548981, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "distill_loss": 0.1426267921924591, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "ref_ce_loss": 0.08958389610052109, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "loss": 0.8291692733764648, + "step": 12720 + }, + { + "ce_loss": 0.10521968454122543, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "distill_loss": 0.15522602200508118, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "ref_ce_loss": 0.10163272172212601, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "loss": 0.3573615849018097, + "step": 12720 + }, + { + "ce_loss": 0.08448813855648041, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "distill_loss": 0.1453476995229721, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "ref_ce_loss": 0.08841101080179214, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "loss": 0.3858444094657898, + "step": 12720 + }, + { + "ce_loss": 0.1092870756983757, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "distill_loss": 0.1326705813407898, + "epoch": 4.242828552368246, + "step": 12720 + }, + { + "epoch": 4.242828552368246, + "ref_ce_loss": 0.11946248263120651, + "step": 12720 + }, + { + "epoch": 4.246164109406271, + "loss": 0.4509, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "grad_norm": 2.041639804840088, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "learning_rate": 0.00014234650287411825, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "loss": 0.5689536929130554, + "step": 12730 + }, + { + "ce_loss": 0.05791715532541275, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "distill_loss": 0.09933005273342133, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "ref_ce_loss": 0.05065986141562462, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "loss": 0.4074205160140991, + "step": 12730 + }, + { + "ce_loss": 0.13159328699111938, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "distill_loss": 0.18942898511886597, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "ref_ce_loss": 0.06139923259615898, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "loss": 0.28600767254829407, + "step": 12730 + }, + { + "ce_loss": 0.0682460144162178, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "distill_loss": 0.11950859427452087, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "ref_ce_loss": 0.05969545617699623, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "loss": 0.2821713984012604, + "step": 12730 + }, + { + "ce_loss": 0.08232744038105011, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "distill_loss": 0.10216699540615082, + "epoch": 4.246164109406271, + "step": 12730 + }, + { + "epoch": 4.246164109406271, + "ref_ce_loss": 0.0690549984574318, + "step": 12730 + }, + { + "epoch": 4.249499666444296, + "loss": 0.4278, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "grad_norm": 2.6584384441375732, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "learning_rate": 0.00014214421233341927, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "loss": 0.38530316948890686, + "step": 12740 + }, + { + "ce_loss": 0.06119230389595032, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "distill_loss": 0.13711406290531158, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "ref_ce_loss": 0.09056949615478516, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "loss": 0.3610149919986725, + "step": 12740 + }, + { + "ce_loss": 0.1303795576095581, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "distill_loss": 0.10823221504688263, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "ref_ce_loss": 0.12236493825912476, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "loss": 0.37968602776527405, + "step": 12740 + }, + { + "ce_loss": 0.11002121865749359, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "distill_loss": 0.15049538016319275, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "ref_ce_loss": 0.08402389287948608, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "loss": 0.30963027477264404, + "step": 12740 + }, + { + "ce_loss": 0.09638215601444244, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "distill_loss": 0.10256923735141754, + "epoch": 4.249499666444296, + "step": 12740 + }, + { + "epoch": 4.249499666444296, + "ref_ce_loss": 0.08142927289009094, + "step": 12740 + }, + { + "epoch": 4.252835223482322, + "loss": 0.4419, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "grad_norm": 2.8495850563049316, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "learning_rate": 0.0001419419361185781, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "loss": 0.49968162178993225, + "step": 12750 + }, + { + "ce_loss": 0.11410761624574661, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "distill_loss": 0.2119593620300293, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "ref_ce_loss": 0.09284225106239319, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "loss": 0.49200567603111267, + "step": 12750 + }, + { + "ce_loss": 0.18147218227386475, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "distill_loss": 0.16002869606018066, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "ref_ce_loss": 0.057403866201639175, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "loss": 0.32698220014572144, + "step": 12750 + }, + { + "ce_loss": 0.08053304255008698, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "distill_loss": 0.16222812235355377, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "ref_ce_loss": 0.08418859541416168, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "loss": 0.37825435400009155, + "step": 12750 + }, + { + "ce_loss": 0.11178170144557953, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "distill_loss": 0.13081073760986328, + "epoch": 4.252835223482322, + "step": 12750 + }, + { + "epoch": 4.252835223482322, + "ref_ce_loss": 0.09388069063425064, + "step": 12750 + }, + { + "epoch": 4.256170780520347, + "loss": 0.4337, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "grad_norm": 3.041410207748413, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "learning_rate": 0.00014173967459846684, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "loss": 0.4082252085208893, + "step": 12760 + }, + { + "ce_loss": 0.07274634391069412, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "distill_loss": 0.18472713232040405, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "ref_ce_loss": 0.08851490914821625, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "loss": 0.4447764456272125, + "step": 12760 + }, + { + "ce_loss": 0.15090270340442657, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "distill_loss": 0.11900592595338821, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "ref_ce_loss": 0.0788751021027565, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "loss": 0.502668023109436, + "step": 12760 + }, + { + "ce_loss": 0.09444031119346619, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "distill_loss": 0.1498761624097824, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "ref_ce_loss": 0.10147394984960556, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "loss": 0.7868169546127319, + "step": 12760 + }, + { + "ce_loss": 0.14243212342262268, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "distill_loss": 0.11945310980081558, + "epoch": 4.256170780520347, + "step": 12760 + }, + { + "epoch": 4.256170780520347, + "ref_ce_loss": 0.04398519545793533, + "step": 12760 + }, + { + "epoch": 4.259506337558372, + "loss": 0.441, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "grad_norm": 2.3769431114196777, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "learning_rate": 0.00014153742814193066, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "loss": 0.23193150758743286, + "step": 12770 + }, + { + "ce_loss": 0.018042435869574547, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "distill_loss": 0.11690559983253479, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "ref_ce_loss": 0.09686476737260818, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "loss": 0.38023316860198975, + "step": 12770 + }, + { + "ce_loss": 0.14664292335510254, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "distill_loss": 0.11862847208976746, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "ref_ce_loss": 0.09226536750793457, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "loss": 0.3144304156303406, + "step": 12770 + }, + { + "ce_loss": 0.07538256794214249, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "distill_loss": 0.13220840692520142, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "ref_ce_loss": 0.0619903989136219, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "loss": 0.5431100130081177, + "step": 12770 + }, + { + "ce_loss": 0.11705781519412994, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "distill_loss": 0.13160298764705658, + "epoch": 4.259506337558372, + "step": 12770 + }, + { + "epoch": 4.259506337558372, + "ref_ce_loss": 0.1292382925748825, + "step": 12770 + }, + { + "epoch": 4.262841894596398, + "loss": 0.4342, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "grad_norm": 4.000396728515625, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "learning_rate": 0.00014133519711778734, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "loss": 0.608391523361206, + "step": 12780 + }, + { + "ce_loss": 0.1167433112859726, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "distill_loss": 0.16401061415672302, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "ref_ce_loss": 0.09299744665622711, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "loss": 0.4936920404434204, + "step": 12780 + }, + { + "ce_loss": 0.11663345247507095, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "distill_loss": 0.15554045140743256, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "ref_ce_loss": 0.0900701954960823, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "loss": 0.29712164402008057, + "step": 12780 + }, + { + "ce_loss": 0.10573622584342957, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "distill_loss": 0.0938921868801117, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "ref_ce_loss": 0.07227256894111633, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "loss": 0.2216934859752655, + "step": 12780 + }, + { + "ce_loss": 0.037140414118766785, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "distill_loss": 0.11072830855846405, + "epoch": 4.262841894596398, + "step": 12780 + }, + { + "epoch": 4.262841894596398, + "ref_ce_loss": 0.05503278598189354, + "step": 12780 + }, + { + "epoch": 4.266177451634423, + "loss": 0.4581, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "grad_norm": 2.528873920440674, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "learning_rate": 0.00014113298189482652, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "loss": 0.29387056827545166, + "step": 12790 + }, + { + "ce_loss": 0.0636768564581871, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "distill_loss": 0.12197493016719818, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "ref_ce_loss": 0.08008244633674622, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "loss": 0.35066884756088257, + "step": 12790 + }, + { + "ce_loss": 0.08643355220556259, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "distill_loss": 0.1131490170955658, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "ref_ce_loss": 0.11212163418531418, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "loss": 0.5954700708389282, + "step": 12790 + }, + { + "ce_loss": 0.1914316564798355, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "distill_loss": 0.1783093959093094, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "ref_ce_loss": 0.1309003233909607, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "loss": 0.2525702714920044, + "step": 12790 + }, + { + "ce_loss": 0.05564585328102112, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "distill_loss": 0.11780757457017899, + "epoch": 4.266177451634423, + "step": 12790 + }, + { + "epoch": 4.266177451634423, + "ref_ce_loss": 0.07907214760780334, + "step": 12790 + }, + { + "epoch": 4.269513008672448, + "loss": 0.4115, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "grad_norm": 3.688870906829834, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "learning_rate": 0.00014093078284180892, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "loss": 0.2441685050725937, + "step": 12800 + }, + { + "ce_loss": 0.07012347877025604, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "distill_loss": 0.0923108235001564, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "ref_ce_loss": 0.06695084273815155, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "loss": 0.42757582664489746, + "step": 12800 + }, + { + "ce_loss": 0.13388575613498688, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "distill_loss": 0.1347452849149704, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "ref_ce_loss": 0.09412025660276413, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "loss": 0.42638054490089417, + "step": 12800 + }, + { + "ce_loss": 0.16536660492420197, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "distill_loss": 0.18622267246246338, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "ref_ce_loss": 0.07472360134124756, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "loss": 0.42994916439056396, + "step": 12800 + }, + { + "ce_loss": 0.1114339604973793, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "distill_loss": 0.13885322213172913, + "epoch": 4.269513008672448, + "step": 12800 + }, + { + "epoch": 4.269513008672448, + "ref_ce_loss": 0.1167064681649208, + "step": 12800 + }, + { + "epoch": 4.272848565710474, + "loss": 0.4355, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "grad_norm": 2.8595855236053467, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "learning_rate": 0.00014072860032746592, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "loss": 0.5688683390617371, + "step": 12810 + }, + { + "ce_loss": 0.1201975867152214, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "distill_loss": 0.1736508160829544, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "ref_ce_loss": 0.08065781742334366, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "loss": 0.5388529300689697, + "step": 12810 + }, + { + "ce_loss": 0.13207614421844482, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "distill_loss": 0.22794324159622192, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "ref_ce_loss": 0.10184848308563232, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "loss": 0.39517244696617126, + "step": 12810 + }, + { + "ce_loss": 0.059914980083703995, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "distill_loss": 0.16114996373653412, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "ref_ce_loss": 0.08862312138080597, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "loss": 0.4757397472858429, + "step": 12810 + }, + { + "ce_loss": 0.12424880266189575, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "distill_loss": 0.15512457489967346, + "epoch": 4.272848565710474, + "step": 12810 + }, + { + "epoch": 4.272848565710474, + "ref_ce_loss": 0.07396658509969711, + "step": 12810 + }, + { + "epoch": 4.276184122748499, + "loss": 0.4181, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "grad_norm": 3.4736826419830322, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "learning_rate": 0.0001405264347204987, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "loss": 0.4792317748069763, + "step": 12820 + }, + { + "ce_loss": 0.07258214801549911, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "distill_loss": 0.16453388333320618, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "ref_ce_loss": 0.08609256148338318, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "loss": 0.32401522994041443, + "step": 12820 + }, + { + "ce_loss": 0.10857275873422623, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "distill_loss": 0.11418082565069199, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "ref_ce_loss": 0.08071093261241913, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "loss": 0.309609055519104, + "step": 12820 + }, + { + "ce_loss": 0.07577096670866013, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "distill_loss": 0.1278049200773239, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "ref_ce_loss": 0.075919009745121, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "loss": 0.39440685510635376, + "step": 12820 + }, + { + "ce_loss": 0.1170484647154808, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "distill_loss": 0.14061780273914337, + "epoch": 4.276184122748499, + "step": 12820 + }, + { + "epoch": 4.276184122748499, + "ref_ce_loss": 0.08952930569648743, + "step": 12820 + }, + { + "epoch": 4.2795196797865245, + "loss": 0.4295, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "grad_norm": 1.8937972784042358, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "learning_rate": 0.00014032428638957747, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "loss": 0.4438021779060364, + "step": 12830 + }, + { + "ce_loss": 0.1217060387134552, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "distill_loss": 0.1110411062836647, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "ref_ce_loss": 0.14042538404464722, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "loss": 0.30124080181121826, + "step": 12830 + }, + { + "ce_loss": 0.05258672684431076, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "distill_loss": 0.09894955903291702, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "ref_ce_loss": 0.08771368116140366, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "loss": 0.7485233545303345, + "step": 12830 + }, + { + "ce_loss": 0.046000298112630844, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "distill_loss": 0.10718851536512375, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "ref_ce_loss": 0.08277367800474167, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "loss": 0.4868257939815521, + "step": 12830 + }, + { + "ce_loss": 0.15199749171733856, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "distill_loss": 0.15022709965705872, + "epoch": 4.2795196797865245, + "step": 12830 + }, + { + "epoch": 4.2795196797865245, + "ref_ce_loss": 0.06878019124269485, + "step": 12830 + }, + { + "epoch": 4.28285523682455, + "loss": 0.3795, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "grad_norm": 2.2695956230163574, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "learning_rate": 0.0001401221557033411, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "loss": 0.4565892815589905, + "step": 12840 + }, + { + "ce_loss": 0.10048975050449371, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "distill_loss": 0.12798023223876953, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "ref_ce_loss": 0.11618823558092117, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "loss": 0.36068567633628845, + "step": 12840 + }, + { + "ce_loss": 0.09661982953548431, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "distill_loss": 0.15107427537441254, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "ref_ce_loss": 0.08204665035009384, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "loss": 0.3397977948188782, + "step": 12840 + }, + { + "ce_loss": 0.08543965220451355, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "distill_loss": 0.15523698925971985, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "ref_ce_loss": 0.07969725877046585, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "loss": 0.49361568689346313, + "step": 12840 + }, + { + "ce_loss": 0.1492895632982254, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "distill_loss": 0.18394288420677185, + "epoch": 4.28285523682455, + "step": 12840 + }, + { + "epoch": 4.28285523682455, + "ref_ce_loss": 0.1353003978729248, + "step": 12840 + }, + { + "epoch": 4.286190793862575, + "loss": 0.4424, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "grad_norm": 2.4746813774108887, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "learning_rate": 0.0001399200430303963, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "loss": 0.6018202900886536, + "step": 12850 + }, + { + "ce_loss": 0.16130967438220978, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "distill_loss": 0.22548778355121613, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "ref_ce_loss": 0.11540760099887848, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "loss": 0.5410385727882385, + "step": 12850 + }, + { + "ce_loss": 0.13062411546707153, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "distill_loss": 0.17460712790489197, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "ref_ce_loss": 0.1254759281873703, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "loss": 0.7663313150405884, + "step": 12850 + }, + { + "ce_loss": 0.07273073494434357, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "distill_loss": 0.11530451476573944, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "ref_ce_loss": 0.058275263756513596, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "loss": 0.5315554141998291, + "step": 12850 + }, + { + "ce_loss": 0.1627359837293625, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "distill_loss": 0.204071044921875, + "epoch": 4.286190793862575, + "step": 12850 + }, + { + "epoch": 4.286190793862575, + "ref_ce_loss": 0.130750373005867, + "step": 12850 + }, + { + "epoch": 4.2895263509006005, + "loss": 0.4477, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "grad_norm": 2.307910203933716, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "learning_rate": 0.00013971794873931674, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "loss": 0.47329601645469666, + "step": 12860 + }, + { + "ce_loss": 0.16464509069919586, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "distill_loss": 0.1366921365261078, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "ref_ce_loss": 0.1367800086736679, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "loss": 0.7709516286849976, + "step": 12860 + }, + { + "ce_loss": 0.09364837408065796, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "distill_loss": 0.12354176491498947, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "ref_ce_loss": 0.07555168122053146, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "loss": 0.4722943902015686, + "step": 12860 + }, + { + "ce_loss": 0.12364023178815842, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "distill_loss": 0.11006634682416916, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "ref_ce_loss": 0.12410124391317368, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "loss": 0.44431930780410767, + "step": 12860 + }, + { + "ce_loss": 0.1067247986793518, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "distill_loss": 0.14513981342315674, + "epoch": 4.2895263509006005, + "step": 12860 + }, + { + "epoch": 4.2895263509006005, + "ref_ce_loss": 0.1249135211110115, + "step": 12860 + }, + { + "epoch": 4.292861907938626, + "loss": 0.4131, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "grad_norm": 2.4325857162475586, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "learning_rate": 0.0001395158731986428, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "loss": 0.5351008772850037, + "step": 12870 + }, + { + "ce_loss": 0.16410230100154877, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "distill_loss": 0.19365465641021729, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "ref_ce_loss": 0.08920037001371384, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "loss": 0.4966471791267395, + "step": 12870 + }, + { + "ce_loss": 0.16271260380744934, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "distill_loss": 0.11182349920272827, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "ref_ce_loss": 0.10847348719835281, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "loss": 0.3838697075843811, + "step": 12870 + }, + { + "ce_loss": 0.0985030084848404, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "distill_loss": 0.10047741234302521, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "ref_ce_loss": 0.0935835912823677, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "loss": 0.3411202132701874, + "step": 12870 + }, + { + "ce_loss": 0.08071853220462799, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "distill_loss": 0.16016238927841187, + "epoch": 4.292861907938626, + "step": 12870 + }, + { + "epoch": 4.292861907938626, + "ref_ce_loss": 0.05927430838346481, + "step": 12870 + }, + { + "epoch": 4.296197464976651, + "loss": 0.3782, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "grad_norm": 2.5318892002105713, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "learning_rate": 0.00013931381677688044, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "loss": 0.2524895966053009, + "step": 12880 + }, + { + "ce_loss": 0.024324163794517517, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "distill_loss": 0.14109152555465698, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "ref_ce_loss": 0.08681479096412659, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "loss": 0.2724023759365082, + "step": 12880 + }, + { + "ce_loss": 0.04453759267926216, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "distill_loss": 0.1311105340719223, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "ref_ce_loss": 0.09659942239522934, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "loss": 0.5654150247573853, + "step": 12880 + }, + { + "ce_loss": 0.10163300484418869, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "distill_loss": 0.10299927741289139, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "ref_ce_loss": 0.07143683731555939, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "loss": 0.29135653376579285, + "step": 12880 + }, + { + "ce_loss": 0.04858585447072983, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "distill_loss": 0.14956815540790558, + "epoch": 4.296197464976651, + "step": 12880 + }, + { + "epoch": 4.296197464976651, + "ref_ce_loss": 0.09311891347169876, + "step": 12880 + }, + { + "epoch": 4.299533022014677, + "loss": 0.4558, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "grad_norm": 3.607006072998047, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "learning_rate": 0.0001391117798425009, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "loss": 0.588657021522522, + "step": 12890 + }, + { + "ce_loss": 0.05568239837884903, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "distill_loss": 0.09379260241985321, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "ref_ce_loss": 0.10455752909183502, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "loss": 0.29446786642074585, + "step": 12890 + }, + { + "ce_loss": 0.07457101345062256, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "distill_loss": 0.13319659233093262, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "ref_ce_loss": 0.06350753456354141, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "loss": 0.4449309706687927, + "step": 12890 + }, + { + "ce_loss": 0.0714930072426796, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "distill_loss": 0.149851456284523, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "ref_ce_loss": 0.08158783614635468, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "loss": 0.5386632084846497, + "step": 12890 + }, + { + "ce_loss": 0.22696617245674133, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "distill_loss": 0.15260447561740875, + "epoch": 4.299533022014677, + "step": 12890 + }, + { + "epoch": 4.299533022014677, + "ref_ce_loss": 0.11920776963233948, + "step": 12890 + }, + { + "epoch": 4.302868579052702, + "loss": 0.4182, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "grad_norm": 1.9943865537643433, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "learning_rate": 0.00013890976276393998, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "loss": 0.6233004331588745, + "step": 12900 + }, + { + "ce_loss": 0.1952550858259201, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "distill_loss": 0.16152338683605194, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "ref_ce_loss": 0.11507376283407211, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "loss": 0.3819960355758667, + "step": 12900 + }, + { + "ce_loss": 0.0760917067527771, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "distill_loss": 0.12423792481422424, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "ref_ce_loss": 0.13592945039272308, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "loss": 0.32491952180862427, + "step": 12900 + }, + { + "ce_loss": 0.12789982557296753, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "distill_loss": 0.10568737983703613, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "ref_ce_loss": 0.09125398099422455, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "loss": 0.38200074434280396, + "step": 12900 + }, + { + "ce_loss": 0.130323588848114, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "distill_loss": 0.11945810168981552, + "epoch": 4.302868579052702, + "step": 12900 + }, + { + "epoch": 4.302868579052702, + "ref_ce_loss": 0.07663031667470932, + "step": 12900 + }, + { + "epoch": 4.306204136090727, + "loss": 0.3974, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "grad_norm": 2.2518882751464844, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "learning_rate": 0.00013870776590959693, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "loss": 0.3219544291496277, + "step": 12910 + }, + { + "ce_loss": 0.1147737130522728, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "distill_loss": 0.1198507696390152, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "ref_ce_loss": 0.08728796243667603, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "loss": 0.40332797169685364, + "step": 12910 + }, + { + "ce_loss": 0.103706955909729, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "distill_loss": 0.10040242224931717, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "ref_ce_loss": 0.07252658158540726, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "loss": 0.3360239863395691, + "step": 12910 + }, + { + "ce_loss": 0.1121392697095871, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "distill_loss": 0.11188937723636627, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "ref_ce_loss": 0.08032336086034775, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "loss": 0.27747413516044617, + "step": 12910 + }, + { + "ce_loss": 0.06619500368833542, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "distill_loss": 0.10412861406803131, + "epoch": 4.306204136090727, + "step": 12910 + }, + { + "epoch": 4.306204136090727, + "ref_ce_loss": 0.08311747759580612, + "step": 12910 + }, + { + "epoch": 4.309539693128753, + "loss": 0.3994, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "grad_norm": 2.2300546169281006, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "learning_rate": 0.00013850578964783454, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "loss": 0.4618908762931824, + "step": 12920 + }, + { + "ce_loss": 0.14769093692302704, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "distill_loss": 0.12974810600280762, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "ref_ce_loss": 0.08218572288751602, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "loss": 0.36354151368141174, + "step": 12920 + }, + { + "ce_loss": 0.10538803040981293, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "distill_loss": 0.12680771946907043, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "ref_ce_loss": 0.09596758335828781, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "loss": 0.3615802526473999, + "step": 12920 + }, + { + "ce_loss": 0.14971128106117249, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "distill_loss": 0.12290134280920029, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "ref_ce_loss": 0.07071920484304428, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "loss": 0.1990204006433487, + "step": 12920 + }, + { + "ce_loss": 0.026068618521094322, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "distill_loss": 0.1140662431716919, + "epoch": 4.309539693128753, + "step": 12920 + }, + { + "epoch": 4.309539693128753, + "ref_ce_loss": 0.05882774293422699, + "step": 12920 + }, + { + "epoch": 4.312875250166778, + "loss": 0.4269, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "grad_norm": 2.498473882675171, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "learning_rate": 0.00013830383434697765, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "loss": 0.9417632818222046, + "step": 12930 + }, + { + "ce_loss": 0.07766778767108917, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "distill_loss": 0.12822362780570984, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "ref_ce_loss": 0.10774123668670654, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "loss": 0.3450748920440674, + "step": 12930 + }, + { + "ce_loss": 0.07935471087694168, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "distill_loss": 0.10800100117921829, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "ref_ce_loss": 0.08134301751852036, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "loss": 0.4334663152694702, + "step": 12930 + }, + { + "ce_loss": 0.15563234686851501, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "distill_loss": 0.1411948800086975, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "ref_ce_loss": 0.10971379280090332, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "loss": 0.8446991443634033, + "step": 12930 + }, + { + "ce_loss": 0.09985993802547455, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "distill_loss": 0.20070119202136993, + "epoch": 4.312875250166778, + "step": 12930 + }, + { + "epoch": 4.312875250166778, + "ref_ce_loss": 0.11902379244565964, + "step": 12930 + }, + { + "epoch": 4.316210807204803, + "loss": 0.415, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "grad_norm": 3.0529587268829346, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "learning_rate": 0.00013810190037531314, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "loss": 0.33493930101394653, + "step": 12940 + }, + { + "ce_loss": 0.10959524661302567, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "distill_loss": 0.13235542178153992, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "ref_ce_loss": 0.09294150024652481, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "loss": 0.43003711104393005, + "step": 12940 + }, + { + "ce_loss": 0.1794055849313736, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "distill_loss": 0.11357323825359344, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "ref_ce_loss": 0.13700471818447113, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "loss": 0.306405633687973, + "step": 12940 + }, + { + "ce_loss": 0.09713051468133926, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "distill_loss": 0.11166144907474518, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "ref_ce_loss": 0.05975410342216492, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "loss": 0.4291016161441803, + "step": 12940 + }, + { + "ce_loss": 0.09967274218797684, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "distill_loss": 0.14119568467140198, + "epoch": 4.316210807204803, + "step": 12940 + }, + { + "epoch": 4.316210807204803, + "ref_ce_loss": 0.12897689640522003, + "step": 12940 + }, + { + "epoch": 4.319546364242829, + "loss": 0.4561, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "grad_norm": 4.655219078063965, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "learning_rate": 0.00013789998810108904, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "loss": 0.34342288970947266, + "step": 12950 + }, + { + "ce_loss": 0.046129222959280014, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "distill_loss": 0.10202468931674957, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "ref_ce_loss": 0.071707583963871, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "loss": 0.5038535594940186, + "step": 12950 + }, + { + "ce_loss": 0.11776093393564224, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "distill_loss": 0.10977289825677872, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "ref_ce_loss": 0.09411168843507767, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "loss": 0.3241851329803467, + "step": 12950 + }, + { + "ce_loss": 0.08705181628465652, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "distill_loss": 0.11426226794719696, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "ref_ce_loss": 0.08718805015087128, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "loss": 0.316148966550827, + "step": 12950 + }, + { + "ce_loss": 0.0800526961684227, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "distill_loss": 0.11344519257545471, + "epoch": 4.319546364242829, + "step": 12950 + }, + { + "epoch": 4.319546364242829, + "ref_ce_loss": 0.0967073068022728, + "step": 12950 + }, + { + "epoch": 4.322881921280854, + "loss": 0.4104, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "grad_norm": 2.311840057373047, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "learning_rate": 0.00013769809789251347, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "loss": 0.7348376512527466, + "step": 12960 + }, + { + "ce_loss": 0.08843173086643219, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "distill_loss": 0.09694448858499527, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "ref_ce_loss": 0.08761128038167953, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "loss": 0.36715108156204224, + "step": 12960 + }, + { + "ce_loss": 0.11807632446289062, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "distill_loss": 0.13147501647472382, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "ref_ce_loss": 0.09746716916561127, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "loss": 0.4032805562019348, + "step": 12960 + }, + { + "ce_loss": 0.10035624355077744, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "distill_loss": 0.16447895765304565, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "ref_ce_loss": 0.11302655190229416, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "loss": 0.5197752714157104, + "step": 12960 + }, + { + "ce_loss": 0.10778123140335083, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "distill_loss": 0.1056332215666771, + "epoch": 4.322881921280854, + "step": 12960 + }, + { + "epoch": 4.322881921280854, + "ref_ce_loss": 0.060031503438949585, + "step": 12960 + }, + { + "epoch": 4.326217478318879, + "loss": 0.4357, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "grad_norm": 2.313013792037964, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "learning_rate": 0.00013749623011775463, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "loss": 0.34826603531837463, + "step": 12970 + }, + { + "ce_loss": 0.07239992171525955, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "distill_loss": 0.15117846429347992, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "ref_ce_loss": 0.07360713928937912, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "loss": 0.35273313522338867, + "step": 12970 + }, + { + "ce_loss": 0.057199910283088684, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "distill_loss": 0.13013851642608643, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "ref_ce_loss": 0.12024141103029251, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "loss": 0.5760383605957031, + "step": 12970 + }, + { + "ce_loss": 0.1058734655380249, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "distill_loss": 0.09850041568279266, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "ref_ce_loss": 0.12359046190977097, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "loss": 0.32047203183174133, + "step": 12970 + }, + { + "ce_loss": 0.07591608166694641, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "distill_loss": 0.1338208019733429, + "epoch": 4.326217478318879, + "step": 12970 + }, + { + "epoch": 4.326217478318879, + "ref_ce_loss": 0.07232265919446945, + "step": 12970 + }, + { + "epoch": 4.329553035356905, + "loss": 0.4049, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "grad_norm": 4.8108649253845215, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "learning_rate": 0.00013729438514493983, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "loss": 0.7760747671127319, + "step": 12980 + }, + { + "ce_loss": 0.1079782098531723, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "distill_loss": 0.17459788918495178, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "ref_ce_loss": 0.08016688376665115, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "loss": 0.4666213393211365, + "step": 12980 + }, + { + "ce_loss": 0.09899456799030304, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "distill_loss": 0.18947945535182953, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "ref_ce_loss": 0.1227281391620636, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "loss": 0.4355705976486206, + "step": 12980 + }, + { + "ce_loss": 0.11216352880001068, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "distill_loss": 0.13032612204551697, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "ref_ce_loss": 0.10035806149244308, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "loss": 0.4230186343193054, + "step": 12980 + }, + { + "ce_loss": 0.04993223026394844, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "distill_loss": 0.2290044128894806, + "epoch": 4.329553035356905, + "step": 12980 + }, + { + "epoch": 4.329553035356905, + "ref_ce_loss": 0.09435595571994781, + "step": 12980 + }, + { + "epoch": 4.33288859239493, + "loss": 0.4533, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "grad_norm": 4.167677879333496, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "learning_rate": 0.00013709256334215445, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "loss": 0.9822889566421509, + "step": 12990 + }, + { + "ce_loss": 0.14912807941436768, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "distill_loss": 0.15161332488059998, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "ref_ce_loss": 0.08670554310083389, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "loss": 0.38593828678131104, + "step": 12990 + }, + { + "ce_loss": 0.08408203721046448, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "distill_loss": 0.15504911541938782, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "ref_ce_loss": 0.08915705233812332, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "loss": 0.3300465941429138, + "step": 12990 + }, + { + "ce_loss": 0.05601300671696663, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "distill_loss": 0.0853685736656189, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "ref_ce_loss": 0.06712325662374496, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "loss": 0.3273487687110901, + "step": 12990 + }, + { + "ce_loss": 0.07666195929050446, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "distill_loss": 0.13456887006759644, + "epoch": 4.33288859239493, + "step": 12990 + }, + { + "epoch": 4.33288859239493, + "ref_ce_loss": 0.08263428509235382, + "step": 12990 + }, + { + "epoch": 4.336224149432955, + "loss": 0.4599, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "grad_norm": 3.2254204750061035, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "learning_rate": 0.00013689076507744207, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "loss": 0.27640801668167114, + "step": 13000 + }, + { + "ce_loss": 0.06200258433818817, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "distill_loss": 0.11888018995523453, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "ref_ce_loss": 0.07222095876932144, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "loss": 0.2827991843223572, + "step": 13000 + }, + { + "ce_loss": 0.07215210050344467, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "distill_loss": 0.133390411734581, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "ref_ce_loss": 0.0737648531794548, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "loss": 0.3525514304637909, + "step": 13000 + }, + { + "ce_loss": 0.08922962844371796, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "distill_loss": 0.11442573368549347, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "ref_ce_loss": 0.0543455071747303, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "loss": 0.27908873558044434, + "step": 13000 + }, + { + "ce_loss": 0.05551055073738098, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "distill_loss": 0.11368223279714584, + "epoch": 4.336224149432955, + "step": 13000 + }, + { + "epoch": 4.336224149432955, + "ref_ce_loss": 0.10890942811965942, + "step": 13000 + }, + { + "epoch": 4.339559706470981, + "loss": 0.3712, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "grad_norm": 3.2621657848358154, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "learning_rate": 0.00013668899071880297, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "loss": 0.27522847056388855, + "step": 13010 + }, + { + "ce_loss": 0.07751738280057907, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "distill_loss": 0.13736335933208466, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "ref_ce_loss": 0.06012954190373421, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "loss": 0.7454414367675781, + "step": 13010 + }, + { + "ce_loss": 0.12859775125980377, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "distill_loss": 0.17422287166118622, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "ref_ce_loss": 0.12988649308681488, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "loss": 0.4908011555671692, + "step": 13010 + }, + { + "ce_loss": 0.15541724860668182, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "distill_loss": 0.15849357843399048, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "ref_ce_loss": 0.10041403025388718, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "loss": 0.3782275319099426, + "step": 13010 + }, + { + "ce_loss": 0.11592970043420792, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "distill_loss": 0.19228774309158325, + "epoch": 4.339559706470981, + "step": 13010 + }, + { + "epoch": 4.339559706470981, + "ref_ce_loss": 0.06995546817779541, + "step": 13010 + }, + { + "epoch": 4.342895263509006, + "loss": 0.4515, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "grad_norm": 2.6184566020965576, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "learning_rate": 0.000136487240634194, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "loss": 0.453509122133255, + "step": 13020 + }, + { + "ce_loss": 0.10892651230096817, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "distill_loss": 0.2049221247434616, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "ref_ce_loss": 0.1100277379155159, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "loss": 0.42655685544013977, + "step": 13020 + }, + { + "ce_loss": 0.06593813747167587, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "distill_loss": 0.18975017964839935, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "ref_ce_loss": 0.10121740400791168, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "loss": 0.3206443786621094, + "step": 13020 + }, + { + "ce_loss": 0.11419174075126648, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "distill_loss": 0.13377337157726288, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "ref_ce_loss": 0.056987863034009933, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "loss": 0.34542882442474365, + "step": 13020 + }, + { + "ce_loss": 0.11716454476118088, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "distill_loss": 0.11346367001533508, + "epoch": 4.342895263509006, + "step": 13020 + }, + { + "epoch": 4.342895263509006, + "ref_ce_loss": 0.11472050100564957, + "step": 13020 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.4454, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "grad_norm": 2.3557121753692627, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "learning_rate": 0.00013628551519152783, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.3510717451572418, + "step": 13030 + }, + { + "ce_loss": 0.07808232307434082, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "distill_loss": 0.11191411316394806, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "ref_ce_loss": 0.0939103364944458, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.39723458886146545, + "step": 13030 + }, + { + "ce_loss": 0.14701463282108307, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "distill_loss": 0.1371280550956726, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "ref_ce_loss": 0.0885378047823906, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.5219001770019531, + "step": 13030 + }, + { + "ce_loss": 0.20793606340885162, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "distill_loss": 0.2547592520713806, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "ref_ce_loss": 0.0590997040271759, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "loss": 0.2735883593559265, + "step": 13030 + }, + { + "ce_loss": 0.03217329457402229, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "distill_loss": 0.07507725805044174, + "epoch": 4.3462308205470315, + "step": 13030 + }, + { + "epoch": 4.3462308205470315, + "ref_ce_loss": 0.06571685522794724, + "step": 13030 + }, + { + "epoch": 4.349566377585057, + "loss": 0.4346, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "grad_norm": 1.9382163286209106, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "learning_rate": 0.000136083814758672, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "loss": 0.34804975986480713, + "step": 13040 + }, + { + "ce_loss": 0.11048239469528198, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "distill_loss": 0.17029768228530884, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "ref_ce_loss": 0.0672643780708313, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "loss": 0.3496224284172058, + "step": 13040 + }, + { + "ce_loss": 0.05408291146159172, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "distill_loss": 0.18122805655002594, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "ref_ce_loss": 0.08082221448421478, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "loss": 0.7453316450119019, + "step": 13040 + }, + { + "ce_loss": 0.14862661063671112, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "distill_loss": 0.24175487458705902, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "ref_ce_loss": 0.09618813544511795, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "loss": 0.46699488162994385, + "step": 13040 + }, + { + "ce_loss": 0.14248669147491455, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "distill_loss": 0.15172013640403748, + "epoch": 4.349566377585057, + "step": 13040 + }, + { + "epoch": 4.349566377585057, + "ref_ce_loss": 0.11561673134565353, + "step": 13040 + }, + { + "epoch": 4.352901934623082, + "loss": 0.4355, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "grad_norm": 2.936624050140381, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "learning_rate": 0.00013588213970344855, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "loss": 0.44997403025627136, + "step": 13050 + }, + { + "ce_loss": 0.13845403492450714, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "distill_loss": 0.1162261962890625, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "ref_ce_loss": 0.10550343990325928, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "loss": 0.6536597609519958, + "step": 13050 + }, + { + "ce_loss": 0.1402038037776947, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "distill_loss": 0.18752452731132507, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "ref_ce_loss": 0.08539095520973206, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "loss": 0.35335773229599, + "step": 13050 + }, + { + "ce_loss": 0.09392538666725159, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "distill_loss": 0.13803143799304962, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "ref_ce_loss": 0.09819263964891434, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "loss": 0.29352426528930664, + "step": 13050 + }, + { + "ce_loss": 0.05404730513691902, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "distill_loss": 0.1509397327899933, + "epoch": 4.352901934623082, + "step": 13050 + }, + { + "epoch": 4.352901934623082, + "ref_ce_loss": 0.08849099278450012, + "step": 13050 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.401, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "grad_norm": 2.245464324951172, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "learning_rate": 0.00013568049039363326, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.34581252932548523, + "step": 13060 + }, + { + "ce_loss": 0.07167209684848785, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "distill_loss": 0.13952210545539856, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "ref_ce_loss": 0.06178533658385277, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.3579484522342682, + "step": 13060 + }, + { + "ce_loss": 0.07104761153459549, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "distill_loss": 0.19464844465255737, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "ref_ce_loss": 0.06834821403026581, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.47544968128204346, + "step": 13060 + }, + { + "ce_loss": 0.1545322984457016, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "distill_loss": 0.16924047470092773, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "ref_ce_loss": 0.09331899136304855, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "loss": 0.27666276693344116, + "step": 13060 + }, + { + "ce_loss": 0.099562868475914, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "distill_loss": 0.11031017452478409, + "epoch": 4.3562374916611075, + "step": 13060 + }, + { + "epoch": 4.3562374916611075, + "ref_ce_loss": 0.06676922738552094, + "step": 13060 + }, + { + "epoch": 4.359573048699133, + "loss": 0.4247, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "grad_norm": 2.441117286682129, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "learning_rate": 0.00013547886719695486, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "loss": 0.4055763781070709, + "step": 13070 + }, + { + "ce_loss": 0.1165999099612236, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "distill_loss": 0.13579559326171875, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "ref_ce_loss": 0.10974972695112228, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "loss": 0.31318339705467224, + "step": 13070 + }, + { + "ce_loss": 0.10058171302080154, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "distill_loss": 0.09889546036720276, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "ref_ce_loss": 0.06917301565408707, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "loss": 0.3399220108985901, + "step": 13070 + }, + { + "ce_loss": 0.07428082078695297, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "distill_loss": 0.17034077644348145, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "ref_ce_loss": 0.0660679042339325, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "loss": 0.9781274795532227, + "step": 13070 + }, + { + "ce_loss": 0.14656391739845276, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "distill_loss": 0.12748153507709503, + "epoch": 4.359573048699133, + "step": 13070 + }, + { + "epoch": 4.359573048699133, + "ref_ce_loss": 0.09130503237247467, + "step": 13070 + }, + { + "epoch": 4.362908605737158, + "loss": 0.4427, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "grad_norm": 3.1027774810791016, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "learning_rate": 0.00013527727048109463, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "loss": 0.4412672817707062, + "step": 13080 + }, + { + "ce_loss": 0.1664845198392868, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "distill_loss": 0.1434018909931183, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "ref_ce_loss": 0.1312694400548935, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "loss": 0.3231005072593689, + "step": 13080 + }, + { + "ce_loss": 0.10845252871513367, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "distill_loss": 0.140834778547287, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "ref_ce_loss": 0.07371904700994492, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "loss": 0.3200206756591797, + "step": 13080 + }, + { + "ce_loss": 0.06258603185415268, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "distill_loss": 0.13723817467689514, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "ref_ce_loss": 0.06365492939949036, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "loss": 0.23603607714176178, + "step": 13080 + }, + { + "ce_loss": 0.05964049696922302, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "distill_loss": 0.11777057498693466, + "epoch": 4.362908605737158, + "step": 13080 + }, + { + "epoch": 4.362908605737158, + "ref_ce_loss": 0.057929519563913345, + "step": 13080 + }, + { + "epoch": 4.366244162775184, + "loss": 0.417, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "grad_norm": 2.3897628784179688, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "learning_rate": 0.00013507570061368536, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "loss": 0.5743356943130493, + "step": 13090 + }, + { + "ce_loss": 0.15212753415107727, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "distill_loss": 0.19665196537971497, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "ref_ce_loss": 0.11735095083713531, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "loss": 0.9558845162391663, + "step": 13090 + }, + { + "ce_loss": 0.12546709179878235, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "distill_loss": 0.17301815748214722, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "ref_ce_loss": 0.08531199395656586, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "loss": 0.4186132550239563, + "step": 13090 + }, + { + "ce_loss": 0.1030435711145401, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "distill_loss": 0.15109947323799133, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "ref_ce_loss": 0.09883557260036469, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "loss": 0.49957624077796936, + "step": 13090 + }, + { + "ce_loss": 0.12739510834217072, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "distill_loss": 0.22322194278240204, + "epoch": 4.366244162775184, + "step": 13090 + }, + { + "epoch": 4.366244162775184, + "ref_ce_loss": 0.1483670175075531, + "step": 13090 + }, + { + "epoch": 4.369579719813209, + "loss": 0.424, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "grad_norm": 2.3537068367004395, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "learning_rate": 0.00013487415796231103, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "loss": 0.382670521736145, + "step": 13100 + }, + { + "ce_loss": 0.11593491584062576, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "distill_loss": 0.15580274164676666, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "ref_ce_loss": 0.1107337549328804, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "loss": 0.6211612820625305, + "step": 13100 + }, + { + "ce_loss": 0.23382166028022766, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "distill_loss": 0.20548737049102783, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "ref_ce_loss": 0.13101726770401, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "loss": 0.34104016423225403, + "step": 13100 + }, + { + "ce_loss": 0.08137911558151245, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "distill_loss": 0.12536190450191498, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "ref_ce_loss": 0.1014823243021965, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "loss": 0.31775692105293274, + "step": 13100 + }, + { + "ce_loss": 0.08146476745605469, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "distill_loss": 0.11398107558488846, + "epoch": 4.369579719813209, + "step": 13100 + }, + { + "epoch": 4.369579719813209, + "ref_ce_loss": 0.07950994372367859, + "step": 13100 + }, + { + "epoch": 4.372915276851234, + "loss": 0.4287, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "grad_norm": 1.9185400009155273, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "learning_rate": 0.00013467264289450593, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "loss": 0.3162509500980377, + "step": 13110 + }, + { + "ce_loss": 0.1067291647195816, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "distill_loss": 0.13923035562038422, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "ref_ce_loss": 0.07007686793804169, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "loss": 0.3750307857990265, + "step": 13110 + }, + { + "ce_loss": 0.13682420551776886, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "distill_loss": 0.15495645999908447, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "ref_ce_loss": 0.08312554657459259, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "loss": 0.631348192691803, + "step": 13110 + }, + { + "ce_loss": 0.13241538405418396, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "distill_loss": 0.1382109820842743, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "ref_ce_loss": 0.09631854295730591, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "loss": 0.3689813017845154, + "step": 13110 + }, + { + "ce_loss": 0.09740588068962097, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "distill_loss": 0.13242462277412415, + "epoch": 4.372915276851234, + "step": 13110 + }, + { + "epoch": 4.372915276851234, + "ref_ce_loss": 0.0812930315732956, + "step": 13110 + }, + { + "epoch": 4.37625083388926, + "loss": 0.4738, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "grad_norm": 2.93460750579834, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "learning_rate": 0.00013447115577775403, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "loss": 0.2833386957645416, + "step": 13120 + }, + { + "ce_loss": 0.03885704651474953, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "distill_loss": 0.12924322485923767, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "ref_ce_loss": 0.07931698113679886, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "loss": 0.2440667301416397, + "step": 13120 + }, + { + "ce_loss": 0.048960719257593155, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "distill_loss": 0.09508942067623138, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "ref_ce_loss": 0.07707788050174713, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "loss": 0.6507571935653687, + "step": 13120 + }, + { + "ce_loss": 0.08221402764320374, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "distill_loss": 0.11556510627269745, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "ref_ce_loss": 0.08824213594198227, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "loss": 0.3309069275856018, + "step": 13120 + }, + { + "ce_loss": 0.07076641917228699, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "distill_loss": 0.15475329756736755, + "epoch": 4.37625083388926, + "step": 13120 + }, + { + "epoch": 4.37625083388926, + "ref_ce_loss": 0.10515636205673218, + "step": 13120 + }, + { + "epoch": 4.379586390927285, + "loss": 0.4077, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "grad_norm": 2.9743335247039795, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "learning_rate": 0.00013426969697948838, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "loss": 0.30378925800323486, + "step": 13130 + }, + { + "ce_loss": 0.061307426542043686, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "distill_loss": 0.10912270098924637, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "ref_ce_loss": 0.05688157305121422, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "loss": 0.4710403382778168, + "step": 13130 + }, + { + "ce_loss": 0.06571760028600693, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "distill_loss": 0.210685133934021, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "ref_ce_loss": 0.060083936899900436, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "loss": 0.4687569737434387, + "step": 13130 + }, + { + "ce_loss": 0.1300654113292694, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "distill_loss": 0.21737553179264069, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "ref_ce_loss": 0.12073502689599991, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "loss": 0.5804852247238159, + "step": 13130 + }, + { + "ce_loss": 0.05941832438111305, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "distill_loss": 0.13962823152542114, + "epoch": 4.379586390927285, + "step": 13130 + }, + { + "epoch": 4.379586390927285, + "ref_ce_loss": 0.08975160866975784, + "step": 13130 + }, + { + "epoch": 4.38292194796531, + "loss": 0.4298, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "grad_norm": 2.263075113296509, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "learning_rate": 0.00013406826686709032, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "loss": 0.39446863532066345, + "step": 13140 + }, + { + "ce_loss": 0.08332235366106033, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "distill_loss": 0.1692182719707489, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "ref_ce_loss": 0.09426987171173096, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "loss": 0.8630024194717407, + "step": 13140 + }, + { + "ce_loss": 0.10032361000776291, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "distill_loss": 0.16708731651306152, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "ref_ce_loss": 0.12849758565425873, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "loss": 0.4019727408885956, + "step": 13140 + }, + { + "ce_loss": 0.14358335733413696, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "distill_loss": 0.1606614589691162, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "ref_ce_loss": 0.06312292814254761, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "loss": 0.3633531630039215, + "step": 13140 + }, + { + "ce_loss": 0.09154992550611496, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "distill_loss": 0.15102434158325195, + "epoch": 4.38292194796531, + "step": 13140 + }, + { + "epoch": 4.38292194796531, + "ref_ce_loss": 0.09260568767786026, + "step": 13140 + }, + { + "epoch": 4.386257505003336, + "loss": 0.4414, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "grad_norm": 4.649960517883301, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "learning_rate": 0.00013386686580788893, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "loss": 0.3555895686149597, + "step": 13150 + }, + { + "ce_loss": 0.11861802637577057, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "distill_loss": 0.13332310318946838, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "ref_ce_loss": 0.08811759948730469, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "loss": 0.6896891593933105, + "step": 13150 + }, + { + "ce_loss": 0.14115719497203827, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "distill_loss": 0.15127448737621307, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "ref_ce_loss": 0.10953246802091599, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "loss": 0.5004947185516357, + "step": 13150 + }, + { + "ce_loss": 0.11395592987537384, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "distill_loss": 0.17832502722740173, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "ref_ce_loss": 0.12350393831729889, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "loss": 0.45451653003692627, + "step": 13150 + }, + { + "ce_loss": 0.17620813846588135, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "distill_loss": 0.18706226348876953, + "epoch": 4.386257505003336, + "step": 13150 + }, + { + "epoch": 4.386257505003336, + "ref_ce_loss": 0.06283406913280487, + "step": 13150 + }, + { + "epoch": 4.389593062041361, + "loss": 0.4374, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "grad_norm": 3.5954205989837646, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "learning_rate": 0.00013366549416916033, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "loss": 0.7484768629074097, + "step": 13160 + }, + { + "ce_loss": 0.06970355659723282, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "distill_loss": 0.13633976876735687, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "ref_ce_loss": 0.11723722517490387, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "loss": 0.25856563448905945, + "step": 13160 + }, + { + "ce_loss": 0.05419513210654259, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "distill_loss": 0.12057643383741379, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "ref_ce_loss": 0.083645299077034, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "loss": 0.33344385027885437, + "step": 13160 + }, + { + "ce_loss": 0.10062061250209808, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "distill_loss": 0.13595770299434662, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "ref_ce_loss": 0.0702580064535141, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "loss": 0.44337448477745056, + "step": 13160 + }, + { + "ce_loss": 0.1366061568260193, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "distill_loss": 0.19141413271427155, + "epoch": 4.389593062041361, + "step": 13160 + }, + { + "epoch": 4.389593062041361, + "ref_ce_loss": 0.11482523381710052, + "step": 13160 + }, + { + "epoch": 4.392928619079386, + "loss": 0.4569, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "grad_norm": 4.291663646697998, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "learning_rate": 0.0001334641523181269, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "loss": 0.3736570477485657, + "step": 13170 + }, + { + "ce_loss": 0.12922805547714233, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "distill_loss": 0.1475725919008255, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "ref_ce_loss": 0.09670910239219666, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "loss": 0.43183663487434387, + "step": 13170 + }, + { + "ce_loss": 0.13625575602054596, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "distill_loss": 0.1850631982088089, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "ref_ce_loss": 0.1103607639670372, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "loss": 0.41522231698036194, + "step": 13170 + }, + { + "ce_loss": 0.08343395590782166, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "distill_loss": 0.17501504719257355, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "ref_ce_loss": 0.1140434592962265, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "loss": 0.42343854904174805, + "step": 13170 + }, + { + "ce_loss": 0.14579157531261444, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "distill_loss": 0.13567650318145752, + "epoch": 4.392928619079386, + "step": 13170 + }, + { + "epoch": 4.392928619079386, + "ref_ce_loss": 0.09917942434549332, + "step": 13170 + }, + { + "epoch": 4.396264176117412, + "loss": 0.4089, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "grad_norm": 7.946816921234131, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "learning_rate": 0.00013326284062195682, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "loss": 0.2218421846628189, + "step": 13180 + }, + { + "ce_loss": 0.054962776601314545, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "distill_loss": 0.11377930641174316, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "ref_ce_loss": 0.052917636930942535, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "loss": 0.3381150960922241, + "step": 13180 + }, + { + "ce_loss": 0.03516875579953194, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "distill_loss": 0.18441802263259888, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "ref_ce_loss": 0.07337495684623718, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "loss": 0.427213579416275, + "step": 13180 + }, + { + "ce_loss": 0.13373126089572906, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "distill_loss": 0.16650520265102386, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "ref_ce_loss": 0.07328952848911285, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "loss": 0.8363127708435059, + "step": 13180 + }, + { + "ce_loss": 0.07693137973546982, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "distill_loss": 0.17669403553009033, + "epoch": 4.396264176117412, + "step": 13180 + }, + { + "epoch": 4.396264176117412, + "ref_ce_loss": 0.08033239841461182, + "step": 13180 + }, + { + "epoch": 4.399599733155437, + "loss": 0.445, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "grad_norm": 2.264517068862915, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "learning_rate": 0.00013306155944776315, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "loss": 0.45710739493370056, + "step": 13190 + }, + { + "ce_loss": 0.1478605568408966, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "distill_loss": 0.13630427420139313, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "ref_ce_loss": 0.0982251912355423, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "loss": 0.4475752115249634, + "step": 13190 + }, + { + "ce_loss": 0.12199734896421432, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "distill_loss": 0.15550018846988678, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "ref_ce_loss": 0.10780463367700577, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "loss": 0.34469008445739746, + "step": 13190 + }, + { + "ce_loss": 0.1141442283987999, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "distill_loss": 0.13208875060081482, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "ref_ce_loss": 0.06683417409658432, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "loss": 0.37320512533187866, + "step": 13190 + }, + { + "ce_loss": 0.12317366153001785, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "distill_loss": 0.19788286089897156, + "epoch": 4.399599733155437, + "step": 13190 + }, + { + "epoch": 4.399599733155437, + "ref_ce_loss": 0.051998071372509, + "step": 13190 + }, + { + "epoch": 4.402935290193462, + "loss": 0.4291, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "grad_norm": 2.0251078605651855, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "learning_rate": 0.00013286030916260337, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "loss": 0.4113975167274475, + "step": 13200 + }, + { + "ce_loss": 0.10742149502038956, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "distill_loss": 0.14522676169872284, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "ref_ce_loss": 0.10619600117206573, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "loss": 0.3976299464702606, + "step": 13200 + }, + { + "ce_loss": 0.1575118750333786, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "distill_loss": 0.10850942879915237, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "ref_ce_loss": 0.0715140551328659, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "loss": 0.5364969968795776, + "step": 13200 + }, + { + "ce_loss": 0.1614045798778534, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "distill_loss": 0.23162581026554108, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "ref_ce_loss": 0.10570058971643448, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "loss": 0.3077254891395569, + "step": 13200 + }, + { + "ce_loss": 0.09167087078094482, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "distill_loss": 0.1461600661277771, + "epoch": 4.402935290193462, + "step": 13200 + }, + { + "epoch": 4.402935290193462, + "ref_ce_loss": 0.0452951155602932, + "step": 13200 + }, + { + "epoch": 4.406270847231488, + "loss": 0.4201, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "grad_norm": 3.6564624309539795, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "learning_rate": 0.00013265909013347865, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "loss": 0.6841806769371033, + "step": 13210 + }, + { + "ce_loss": 0.10823642462491989, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "distill_loss": 0.18694089353084564, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "ref_ce_loss": 0.0933864563703537, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "loss": 0.3473528325557709, + "step": 13210 + }, + { + "ce_loss": 0.10555067658424377, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "distill_loss": 0.13975630700588226, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "ref_ce_loss": 0.050673503428697586, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "loss": 0.6095040440559387, + "step": 13210 + }, + { + "ce_loss": 0.15313434600830078, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "distill_loss": 0.1529056876897812, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "ref_ce_loss": 0.07480410486459732, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "loss": 0.48279547691345215, + "step": 13210 + }, + { + "ce_loss": 0.1067587211728096, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "distill_loss": 0.18704378604888916, + "epoch": 4.406270847231488, + "step": 13210 + }, + { + "epoch": 4.406270847231488, + "ref_ce_loss": 0.09729069471359253, + "step": 13210 + }, + { + "epoch": 4.409606404269513, + "loss": 0.4266, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "grad_norm": 2.6186203956604004, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "learning_rate": 0.00013245790272733307, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "loss": 0.3716428279876709, + "step": 13220 + }, + { + "ce_loss": 0.07271506637334824, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "distill_loss": 0.14049409329891205, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "ref_ce_loss": 0.0727582648396492, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "loss": 0.3323671519756317, + "step": 13220 + }, + { + "ce_loss": 0.08981771767139435, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "distill_loss": 0.14637064933776855, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "ref_ce_loss": 0.06951261311769485, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "loss": 0.38633036613464355, + "step": 13220 + }, + { + "ce_loss": 0.10376311838626862, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "distill_loss": 0.1843695342540741, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "ref_ce_loss": 0.068430595099926, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "loss": 0.3019222319126129, + "step": 13220 + }, + { + "ce_loss": 0.015285542234778404, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "distill_loss": 0.09861694276332855, + "epoch": 4.409606404269513, + "step": 13220 + }, + { + "epoch": 4.409606404269513, + "ref_ce_loss": 0.06827165186405182, + "step": 13220 + }, + { + "epoch": 4.4129419613075385, + "loss": 0.4154, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "grad_norm": 2.5071351528167725, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "learning_rate": 0.00013225674731105318, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "loss": 0.5132056474685669, + "step": 13230 + }, + { + "ce_loss": 0.17748892307281494, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "distill_loss": 0.16994822025299072, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "ref_ce_loss": 0.08495409041643143, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "loss": 0.34527939558029175, + "step": 13230 + }, + { + "ce_loss": 0.10870076715946198, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "distill_loss": 0.1736239790916443, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "ref_ce_loss": 0.0629189983010292, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "loss": 0.7003483176231384, + "step": 13230 + }, + { + "ce_loss": 0.24649113416671753, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "distill_loss": 0.21717987954616547, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "ref_ce_loss": 0.10224221646785736, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "loss": 0.4945812523365021, + "step": 13230 + }, + { + "ce_loss": 0.11560438573360443, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "distill_loss": 0.13503828644752502, + "epoch": 4.4129419613075385, + "step": 13230 + }, + { + "epoch": 4.4129419613075385, + "ref_ce_loss": 0.10286214202642441, + "step": 13230 + }, + { + "epoch": 4.416277518345564, + "loss": 0.4726, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "grad_norm": 4.089062690734863, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "learning_rate": 0.00013205562425146696, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "loss": 0.3657887578010559, + "step": 13240 + }, + { + "ce_loss": 0.08184022456407547, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "distill_loss": 0.1520671248435974, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "ref_ce_loss": 0.1031172052025795, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "loss": 0.29702356457710266, + "step": 13240 + }, + { + "ce_loss": 0.09236989170312881, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "distill_loss": 0.12858924269676208, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "ref_ce_loss": 0.048820894211530685, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "loss": 0.6049207448959351, + "step": 13240 + }, + { + "ce_loss": 0.07722629606723785, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "distill_loss": 0.10887705534696579, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "ref_ce_loss": 0.06840761005878448, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "loss": 0.37058472633361816, + "step": 13240 + }, + { + "ce_loss": 0.10921172052621841, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "distill_loss": 0.13606517016887665, + "epoch": 4.416277518345564, + "step": 13240 + }, + { + "epoch": 4.416277518345564, + "ref_ce_loss": 0.12518516182899475, + "step": 13240 + }, + { + "epoch": 4.419613075383589, + "loss": 0.4452, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "grad_norm": 2.7039458751678467, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "learning_rate": 0.00013185453391534365, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "loss": 0.45535069704055786, + "step": 13250 + }, + { + "ce_loss": 0.1358918398618698, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "distill_loss": 0.25594010949134827, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "ref_ce_loss": 0.06339307874441147, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "loss": 0.4872303009033203, + "step": 13250 + }, + { + "ce_loss": 0.11887913197278976, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "distill_loss": 0.20435214042663574, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "ref_ce_loss": 0.07590153813362122, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "loss": 0.286593496799469, + "step": 13250 + }, + { + "ce_loss": 0.09262045472860336, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "distill_loss": 0.13021138310432434, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "ref_ce_loss": 0.04283803701400757, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "loss": 0.3654177486896515, + "step": 13250 + }, + { + "ce_loss": 0.11240356415510178, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "distill_loss": 0.1893204152584076, + "epoch": 4.419613075383589, + "step": 13250 + }, + { + "epoch": 4.419613075383589, + "ref_ce_loss": 0.06355581432580948, + "step": 13250 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.4019, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "grad_norm": 3.225231885910034, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "learning_rate": 0.00013165347666939275, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.5111039876937866, + "step": 13260 + }, + { + "ce_loss": 0.17488041520118713, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "distill_loss": 0.2363886833190918, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "ref_ce_loss": 0.09943842887878418, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.3635881245136261, + "step": 13260 + }, + { + "ce_loss": 0.12603534758090973, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "distill_loss": 0.13557103276252747, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "ref_ce_loss": 0.0772860124707222, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.366436630487442, + "step": 13260 + }, + { + "ce_loss": 0.034161556512117386, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "distill_loss": 0.14038299024105072, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "ref_ce_loss": 0.08084021508693695, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "loss": 0.3443140387535095, + "step": 13260 + }, + { + "ce_loss": 0.10151468962430954, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "distill_loss": 0.13597281277179718, + "epoch": 4.4229486324216145, + "step": 13260 + }, + { + "epoch": 4.4229486324216145, + "ref_ce_loss": 0.07605358213186264, + "step": 13260 + }, + { + "epoch": 4.42628418945964, + "loss": 0.4883, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "grad_norm": 2.648609161376953, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "learning_rate": 0.00013145245288026319, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "loss": 0.4274841547012329, + "step": 13270 + }, + { + "ce_loss": 0.05348929390311241, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "distill_loss": 0.1469852775335312, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "ref_ce_loss": 0.09373314678668976, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "loss": 0.9421871900558472, + "step": 13270 + }, + { + "ce_loss": 0.11481419950723648, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "distill_loss": 0.17199678719043732, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "ref_ce_loss": 0.13030129671096802, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "loss": 0.5865803956985474, + "step": 13270 + }, + { + "ce_loss": 0.15226690471172333, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "distill_loss": 0.20426751673221588, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "ref_ce_loss": 0.08978847414255142, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "loss": 0.41231030225753784, + "step": 13270 + }, + { + "ce_loss": 0.1027587354183197, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "distill_loss": 0.09782750904560089, + "epoch": 4.42628418945964, + "step": 13270 + }, + { + "epoch": 4.42628418945964, + "ref_ce_loss": 0.0866297110915184, + "step": 13270 + }, + { + "epoch": 4.429619746497665, + "loss": 0.4565, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "grad_norm": 2.3905386924743652, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "learning_rate": 0.0001312514629145432, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "loss": 0.3989601731300354, + "step": 13280 + }, + { + "ce_loss": 0.08613447844982147, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "distill_loss": 0.13968902826309204, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "ref_ce_loss": 0.08468887954950333, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "loss": 0.4234456419944763, + "step": 13280 + }, + { + "ce_loss": 0.11148975044488907, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "distill_loss": 0.15834802389144897, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "ref_ce_loss": 0.10697125643491745, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "loss": 0.32392358779907227, + "step": 13280 + }, + { + "ce_loss": 0.06255452334880829, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "distill_loss": 0.14786508679389954, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "ref_ce_loss": 0.07415209710597992, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "loss": 0.47612956166267395, + "step": 13280 + }, + { + "ce_loss": 0.16316884756088257, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "distill_loss": 0.1871202141046524, + "epoch": 4.429619746497665, + "step": 13280 + }, + { + "epoch": 4.429619746497665, + "ref_ce_loss": 0.10071472823619843, + "step": 13280 + }, + { + "epoch": 4.432955303535691, + "loss": 0.4265, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "grad_norm": 2.2215828895568848, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "learning_rate": 0.00013105050713875922, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "loss": 0.5471583604812622, + "step": 13290 + }, + { + "ce_loss": 0.11144928634166718, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "distill_loss": 0.16252776980400085, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "ref_ce_loss": 0.06755536794662476, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "loss": 0.2610267102718353, + "step": 13290 + }, + { + "ce_loss": 0.09459121525287628, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "distill_loss": 0.09221311658620834, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "ref_ce_loss": 0.07398366928100586, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "loss": 0.5348576307296753, + "step": 13290 + }, + { + "ce_loss": 0.0725354477763176, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "distill_loss": 0.13648444414138794, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "ref_ce_loss": 0.10237973183393478, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "loss": 0.33219125866889954, + "step": 13290 + }, + { + "ce_loss": 0.11223422735929489, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "distill_loss": 0.15365713834762573, + "epoch": 4.432955303535691, + "step": 13290 + }, + { + "epoch": 4.432955303535691, + "ref_ce_loss": 0.06554263085126877, + "step": 13290 + }, + { + "epoch": 4.436290860573716, + "loss": 0.4689, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "grad_norm": 1.8762089014053345, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "learning_rate": 0.00013084958591937519, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "loss": 0.3735318183898926, + "step": 13300 + }, + { + "ce_loss": 0.05010557547211647, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "distill_loss": 0.19283129274845123, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "ref_ce_loss": 0.07497648149728775, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "loss": 0.4607619643211365, + "step": 13300 + }, + { + "ce_loss": 0.1650754064321518, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "distill_loss": 0.13137763738632202, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "ref_ce_loss": 0.0980844497680664, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "loss": 0.36769622564315796, + "step": 13300 + }, + { + "ce_loss": 0.08664768934249878, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "distill_loss": 0.11689133197069168, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "ref_ce_loss": 0.0674903467297554, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "loss": 0.41448602080345154, + "step": 13300 + }, + { + "ce_loss": 0.118854820728302, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "distill_loss": 0.156342551112175, + "epoch": 4.436290860573716, + "step": 13300 + }, + { + "epoch": 4.436290860573716, + "ref_ce_loss": 0.08365678787231445, + "step": 13300 + }, + { + "epoch": 4.439626417611741, + "loss": 0.4327, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "grad_norm": 2.736565351486206, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "learning_rate": 0.00013064869962279226, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "loss": 0.47164031863212585, + "step": 13310 + }, + { + "ce_loss": 0.13483786582946777, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "distill_loss": 0.20212770998477936, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "ref_ce_loss": 0.09424245357513428, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "loss": 0.4842525124549866, + "step": 13310 + }, + { + "ce_loss": 0.1050151139497757, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "distill_loss": 0.12448824197053909, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "ref_ce_loss": 0.10238910466432571, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "loss": 0.3458286225795746, + "step": 13310 + }, + { + "ce_loss": 0.07503864914178848, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "distill_loss": 0.14710615575313568, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "ref_ce_loss": 0.08717601001262665, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "loss": 0.4406740367412567, + "step": 13310 + }, + { + "ce_loss": 0.11572980880737305, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "distill_loss": 0.1492346227169037, + "epoch": 4.439626417611741, + "step": 13310 + }, + { + "epoch": 4.439626417611741, + "ref_ce_loss": 0.07425940036773682, + "step": 13310 + }, + { + "epoch": 4.442961974649767, + "loss": 0.4801, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "grad_norm": 3.5770788192749023, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "learning_rate": 0.00013044784861534773, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "loss": 0.3639149069786072, + "step": 13320 + }, + { + "ce_loss": 0.06203765794634819, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "distill_loss": 0.13513824343681335, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "ref_ce_loss": 0.045562874525785446, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "loss": 0.4618871808052063, + "step": 13320 + }, + { + "ce_loss": 0.07080277055501938, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "distill_loss": 0.17618733644485474, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "ref_ce_loss": 0.08771321922540665, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "loss": 0.3985617756843567, + "step": 13320 + }, + { + "ce_loss": 0.11235746741294861, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "distill_loss": 0.14052695035934448, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "ref_ce_loss": 0.1089642271399498, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "loss": 0.3456037640571594, + "step": 13320 + }, + { + "ce_loss": 0.03526025265455246, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "distill_loss": 0.1443566381931305, + "epoch": 4.442961974649767, + "step": 13320 + }, + { + "epoch": 4.442961974649767, + "ref_ce_loss": 0.07900136709213257, + "step": 13320 + }, + { + "epoch": 4.446297531687792, + "loss": 0.3867, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "grad_norm": 2.4664969444274902, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "learning_rate": 0.0001302470332633146, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "loss": 0.6854321360588074, + "step": 13330 + }, + { + "ce_loss": 0.10115274041891098, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "distill_loss": 0.22096025943756104, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "ref_ce_loss": 0.10487420111894608, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "loss": 0.35559317469596863, + "step": 13330 + }, + { + "ce_loss": 0.07464788109064102, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "distill_loss": 0.15491950511932373, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "ref_ce_loss": 0.09533894807100296, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "loss": 0.6421335339546204, + "step": 13330 + }, + { + "ce_loss": 0.10751327127218246, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "distill_loss": 0.21741746366024017, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "ref_ce_loss": 0.09993693977594376, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "loss": 0.5558599829673767, + "step": 13330 + }, + { + "ce_loss": 0.11573997139930725, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "distill_loss": 0.1898089200258255, + "epoch": 4.446297531687792, + "step": 13330 + }, + { + "epoch": 4.446297531687792, + "ref_ce_loss": 0.1459214985370636, + "step": 13330 + }, + { + "epoch": 4.449633088725817, + "loss": 0.487, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "grad_norm": 3.9603376388549805, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "learning_rate": 0.00013004625393290097, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "loss": 0.3406130075454712, + "step": 13340 + }, + { + "ce_loss": 0.09222155809402466, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "distill_loss": 0.15992435812950134, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "ref_ce_loss": 0.055970415472984314, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "loss": 0.27611514925956726, + "step": 13340 + }, + { + "ce_loss": 0.048204366117715836, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "distill_loss": 0.10166335850954056, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "ref_ce_loss": 0.0979592576622963, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "loss": 0.38644808530807495, + "step": 13340 + }, + { + "ce_loss": 0.07723668962717056, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "distill_loss": 0.24085360765457153, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "ref_ce_loss": 0.06818073242902756, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "loss": 0.484552800655365, + "step": 13340 + }, + { + "ce_loss": 0.14864076673984528, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "distill_loss": 0.18420973420143127, + "epoch": 4.449633088725817, + "step": 13340 + }, + { + "epoch": 4.449633088725817, + "ref_ce_loss": 0.15143580734729767, + "step": 13340 + }, + { + "epoch": 4.452968645763843, + "loss": 0.4362, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "grad_norm": 3.3409905433654785, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "learning_rate": 0.000129845510990249, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "loss": 0.5213329792022705, + "step": 13350 + }, + { + "ce_loss": 0.17185841500759125, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "distill_loss": 0.19801545143127441, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "ref_ce_loss": 0.09971991181373596, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "loss": 0.49981921911239624, + "step": 13350 + }, + { + "ce_loss": 0.06869068741798401, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "distill_loss": 0.10246706008911133, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "ref_ce_loss": 0.05476094037294388, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "loss": 0.45160943269729614, + "step": 13350 + }, + { + "ce_loss": 0.12725037336349487, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "distill_loss": 0.2022404819726944, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "ref_ce_loss": 0.07778077572584152, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "loss": 0.3159412741661072, + "step": 13350 + }, + { + "ce_loss": 0.11092124879360199, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "distill_loss": 0.13240209221839905, + "epoch": 4.452968645763843, + "step": 13350 + }, + { + "epoch": 4.452968645763843, + "ref_ce_loss": 0.07244209945201874, + "step": 13350 + }, + { + "epoch": 4.456304202801868, + "loss": 0.4877, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "grad_norm": 2.414829730987549, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "learning_rate": 0.0001296448048014347, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "loss": 0.780661940574646, + "step": 13360 + }, + { + "ce_loss": 0.08376435935497284, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "distill_loss": 0.21675555408000946, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "ref_ce_loss": 0.1165047362446785, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "loss": 0.41870665550231934, + "step": 13360 + }, + { + "ce_loss": 0.08456767350435257, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "distill_loss": 0.10734406113624573, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "ref_ce_loss": 0.07087530195713043, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "loss": 0.2585833668708801, + "step": 13360 + }, + { + "ce_loss": 0.02313615381717682, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "distill_loss": 0.07941682636737823, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "ref_ce_loss": 0.04488234594464302, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "loss": 0.2585996985435486, + "step": 13360 + }, + { + "ce_loss": 0.06306672841310501, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "distill_loss": 0.11801082640886307, + "epoch": 4.456304202801868, + "step": 13360 + }, + { + "epoch": 4.456304202801868, + "ref_ce_loss": 0.07743985950946808, + "step": 13360 + }, + { + "epoch": 4.459639759839893, + "loss": 0.4529, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "grad_norm": 2.287713050842285, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "learning_rate": 0.00012944413573246698, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "loss": 0.3657965064048767, + "step": 13370 + }, + { + "ce_loss": 0.0648331418633461, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "distill_loss": 0.16222751140594482, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "ref_ce_loss": 0.06190655007958412, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "loss": 0.3106033205986023, + "step": 13370 + }, + { + "ce_loss": 0.09094034135341644, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "distill_loss": 0.1325746476650238, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "ref_ce_loss": 0.08697564899921417, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "loss": 0.20381583273410797, + "step": 13370 + }, + { + "ce_loss": 0.019039664417505264, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "distill_loss": 0.08726243674755096, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "ref_ce_loss": 0.0972718819975853, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "loss": 0.6112961769104004, + "step": 13370 + }, + { + "ce_loss": 0.04354558512568474, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "distill_loss": 0.24787333607673645, + "epoch": 4.459639759839893, + "step": 13370 + }, + { + "epoch": 4.459639759839893, + "ref_ce_loss": 0.10577059537172318, + "step": 13370 + }, + { + "epoch": 4.462975316877919, + "loss": 0.4255, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "grad_norm": 2.6293230056762695, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "learning_rate": 0.000129243504149287, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "loss": 0.19177551567554474, + "step": 13380 + }, + { + "ce_loss": 0.020500704646110535, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "distill_loss": 0.09150394052267075, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "ref_ce_loss": 0.051849473267793655, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "loss": 0.29290539026260376, + "step": 13380 + }, + { + "ce_loss": 0.09419353306293488, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "distill_loss": 0.11655114591121674, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "ref_ce_loss": 0.08180296421051025, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "loss": 0.8274011611938477, + "step": 13380 + }, + { + "ce_loss": 0.12826131284236908, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "distill_loss": 0.15427960455417633, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "ref_ce_loss": 0.1055775135755539, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "loss": 0.33117276430130005, + "step": 13380 + }, + { + "ce_loss": 0.05304085463285446, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "distill_loss": 0.12336459755897522, + "epoch": 4.462975316877919, + "step": 13380 + }, + { + "epoch": 4.462975316877919, + "ref_ce_loss": 0.07321083545684814, + "step": 13380 + }, + { + "epoch": 4.466310873915944, + "loss": 0.4772, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "grad_norm": 3.265866994857788, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "learning_rate": 0.00012904291041776776, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "loss": 0.33649423718452454, + "step": 13390 + }, + { + "ce_loss": 0.05174952372908592, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "distill_loss": 0.18453219532966614, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "ref_ce_loss": 0.10008276998996735, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "loss": 0.37133169174194336, + "step": 13390 + }, + { + "ce_loss": 0.1038256585597992, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "distill_loss": 0.1483224332332611, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "ref_ce_loss": 0.09805310517549515, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "loss": 0.7990421056747437, + "step": 13390 + }, + { + "ce_loss": 0.08086645603179932, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "distill_loss": 0.2780746817588806, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "ref_ce_loss": 0.13248445093631744, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "loss": 0.6483367085456848, + "step": 13390 + }, + { + "ce_loss": 0.17032165825366974, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "distill_loss": 0.3139309883117676, + "epoch": 4.466310873915944, + "step": 13390 + }, + { + "epoch": 4.466310873915944, + "ref_ce_loss": 0.11699211597442627, + "step": 13390 + }, + { + "epoch": 4.469646430953969, + "loss": 0.4952, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "grad_norm": 2.7196404933929443, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "learning_rate": 0.000128842354903713, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "loss": 0.26788756251335144, + "step": 13400 + }, + { + "ce_loss": 0.04564729705452919, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "distill_loss": 0.10269252955913544, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "ref_ce_loss": 0.07589396834373474, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "loss": 0.2653442919254303, + "step": 13400 + }, + { + "ce_loss": 0.04288773238658905, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "distill_loss": 0.098898746073246, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "ref_ce_loss": 0.07052404433488846, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "loss": 0.3910791575908661, + "step": 13400 + }, + { + "ce_loss": 0.08472683280706406, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "distill_loss": 0.17124629020690918, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "ref_ce_loss": 0.08802192658185959, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "loss": 0.4675942659378052, + "step": 13400 + }, + { + "ce_loss": 0.06608907133340836, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "distill_loss": 0.2451399713754654, + "epoch": 4.469646430953969, + "step": 13400 + }, + { + "epoch": 4.469646430953969, + "ref_ce_loss": 0.07252366840839386, + "step": 13400 + }, + { + "epoch": 4.472981987991995, + "loss": 0.5049, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "grad_norm": 3.4293053150177, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "learning_rate": 0.00012864183797285683, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "loss": 0.28266942501068115, + "step": 13410 + }, + { + "ce_loss": 0.08216257393360138, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "distill_loss": 0.13507524132728577, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "ref_ce_loss": 0.06497799605131149, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "loss": 0.5329622030258179, + "step": 13410 + }, + { + "ce_loss": 0.18671882152557373, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "distill_loss": 0.26308295130729675, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "ref_ce_loss": 0.06053365767002106, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "loss": 0.578106164932251, + "step": 13410 + }, + { + "ce_loss": 0.1150013655424118, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "distill_loss": 0.20448864996433258, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "ref_ce_loss": 0.09882330894470215, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "loss": 0.46086007356643677, + "step": 13410 + }, + { + "ce_loss": 0.08865619450807571, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "distill_loss": 0.2300991266965866, + "epoch": 4.472981987991995, + "step": 13410 + }, + { + "epoch": 4.472981987991995, + "ref_ce_loss": 0.07322938740253448, + "step": 13410 + }, + { + "epoch": 4.47631754503002, + "loss": 0.4451, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "grad_norm": 2.5083768367767334, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "learning_rate": 0.00012844135999086315, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "loss": 0.5072634220123291, + "step": 13420 + }, + { + "ce_loss": 0.13968051970005035, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "distill_loss": 0.27628523111343384, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "ref_ce_loss": 0.09119226783514023, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "loss": 0.23070460557937622, + "step": 13420 + }, + { + "ce_loss": 0.0296455267816782, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "distill_loss": 0.10874531418085098, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "ref_ce_loss": 0.04925838112831116, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "loss": 0.2778133749961853, + "step": 13420 + }, + { + "ce_loss": 0.04140105098485947, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "distill_loss": 0.16100528836250305, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "ref_ce_loss": 0.07529508322477341, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "loss": 0.3053801655769348, + "step": 13420 + }, + { + "ce_loss": 0.0789996087551117, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "distill_loss": 0.11323997378349304, + "epoch": 4.47631754503002, + "step": 13420 + }, + { + "epoch": 4.47631754503002, + "ref_ce_loss": 0.11304476112127304, + "step": 13420 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.4349, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "grad_norm": 3.8242762088775635, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "learning_rate": 0.00012824092132332466, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.5172803401947021, + "step": 13430 + }, + { + "ce_loss": 0.05954471230506897, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "distill_loss": 0.12819169461727142, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "ref_ce_loss": 0.10265228897333145, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.28423821926116943, + "step": 13430 + }, + { + "ce_loss": 0.029237208887934685, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "distill_loss": 0.13489145040512085, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "ref_ce_loss": 0.06749539822340012, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.37082672119140625, + "step": 13430 + }, + { + "ce_loss": 0.08649688214063644, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "distill_loss": 0.1623936891555786, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "ref_ce_loss": 0.0803963840007782, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "loss": 0.48403871059417725, + "step": 13430 + }, + { + "ce_loss": 0.12726320326328278, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "distill_loss": 0.13687562942504883, + "epoch": 4.4796531020680455, + "step": 13430 + }, + { + "epoch": 4.4796531020680455, + "ref_ce_loss": 0.07841704040765762, + "step": 13430 + }, + { + "epoch": 4.482988659106071, + "loss": 0.4811, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "grad_norm": 2.50536847114563, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "learning_rate": 0.0001280405223357624, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "loss": 0.4957541227340698, + "step": 13440 + }, + { + "ce_loss": 0.10157718509435654, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "distill_loss": 0.16703274846076965, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "ref_ce_loss": 0.09913060814142227, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "loss": 0.26849082112312317, + "step": 13440 + }, + { + "ce_loss": 0.06595831364393234, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "distill_loss": 0.12347928434610367, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "ref_ce_loss": 0.05139641463756561, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "loss": 0.34890177845954895, + "step": 13440 + }, + { + "ce_loss": 0.01840231381356716, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "distill_loss": 0.21284881234169006, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "ref_ce_loss": 0.055508144199848175, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "loss": 0.370951384305954, + "step": 13440 + }, + { + "ce_loss": 0.11548206210136414, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "distill_loss": 0.17274267971515656, + "epoch": 4.482988659106071, + "step": 13440 + }, + { + "epoch": 4.482988659106071, + "ref_ce_loss": 0.08252322673797607, + "step": 13440 + }, + { + "epoch": 4.486324216144096, + "loss": 0.4699, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "grad_norm": 3.5319814682006836, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "learning_rate": 0.0001278401633936251, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "loss": 0.4721037745475769, + "step": 13450 + }, + { + "ce_loss": 0.06719991564750671, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "distill_loss": 0.16518472135066986, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "ref_ce_loss": 0.061171624809503555, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "loss": 0.3494679927825928, + "step": 13450 + }, + { + "ce_loss": 0.06874405592679977, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "distill_loss": 0.11198866367340088, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "ref_ce_loss": 0.10032968968153, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "loss": 0.500599205493927, + "step": 13450 + }, + { + "ce_loss": 0.09012137353420258, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "distill_loss": 0.28814423084259033, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "ref_ce_loss": 0.08558844774961472, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "loss": 0.6183328628540039, + "step": 13450 + }, + { + "ce_loss": 0.12911294400691986, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "distill_loss": 0.28603270649909973, + "epoch": 4.486324216144096, + "step": 13450 + }, + { + "epoch": 4.486324216144096, + "ref_ce_loss": 0.10391736030578613, + "step": 13450 + }, + { + "epoch": 4.4896597731821215, + "loss": 0.4655, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "grad_norm": 3.1925013065338135, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "learning_rate": 0.0001276398448622884, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "loss": 0.7418397665023804, + "step": 13460 + }, + { + "ce_loss": 0.18634061515331268, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "distill_loss": 0.21090039610862732, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "ref_ce_loss": 0.11397220939397812, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "loss": 0.41916927695274353, + "step": 13460 + }, + { + "ce_loss": 0.08889000862836838, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "distill_loss": 0.1984071582555771, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "ref_ce_loss": 0.09343430399894714, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "loss": 0.656537652015686, + "step": 13460 + }, + { + "ce_loss": 0.19183559715747833, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "distill_loss": 0.20771309733390808, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "ref_ce_loss": 0.09294997155666351, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "loss": 0.19820013642311096, + "step": 13460 + }, + { + "ce_loss": 0.028779538348317146, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "distill_loss": 0.11477810144424438, + "epoch": 4.4896597731821215, + "step": 13460 + }, + { + "epoch": 4.4896597731821215, + "ref_ce_loss": 0.05453171953558922, + "step": 13460 + }, + { + "epoch": 4.492995330220147, + "loss": 0.4577, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "grad_norm": 2.804248809814453, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "learning_rate": 0.00012743956710705435, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "loss": 0.45621517300605774, + "step": 13470 + }, + { + "ce_loss": 0.08536119759082794, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "distill_loss": 0.1456269472837448, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "ref_ce_loss": 0.0890783965587616, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "loss": 0.292806476354599, + "step": 13470 + }, + { + "ce_loss": 0.0762527659535408, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "distill_loss": 0.11498447507619858, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "ref_ce_loss": 0.10128685086965561, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "loss": 0.42637720704078674, + "step": 13470 + }, + { + "ce_loss": 0.07126825302839279, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "distill_loss": 0.14693793654441833, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "ref_ce_loss": 0.09059864282608032, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "loss": 0.628233790397644, + "step": 13470 + }, + { + "ce_loss": 0.07562445104122162, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "distill_loss": 0.15411150455474854, + "epoch": 4.492995330220147, + "step": 13470 + }, + { + "epoch": 4.492995330220147, + "ref_ce_loss": 0.08947544544935226, + "step": 13470 + }, + { + "epoch": 4.496330887258172, + "loss": 0.436, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "grad_norm": 3.4965808391571045, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "learning_rate": 0.0001272393304931505, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "loss": 0.5007912516593933, + "step": 13480 + }, + { + "ce_loss": 0.15746666491031647, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "distill_loss": 0.1947905421257019, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "ref_ce_loss": 0.11005079746246338, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "loss": 0.3622717261314392, + "step": 13480 + }, + { + "ce_loss": 0.07799629122018814, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "distill_loss": 0.1503952145576477, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "ref_ce_loss": 0.06716569513082504, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "loss": 0.5783126354217529, + "step": 13480 + }, + { + "ce_loss": 0.10873718559741974, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "distill_loss": 0.10999981313943863, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "ref_ce_loss": 0.11208885908126831, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "loss": 0.3097704350948334, + "step": 13480 + }, + { + "ce_loss": 0.04218291491270065, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "distill_loss": 0.14731380343437195, + "epoch": 4.496330887258172, + "step": 13480 + }, + { + "epoch": 4.496330887258172, + "ref_ce_loss": 0.07287292927503586, + "step": 13480 + }, + { + "epoch": 4.4996664442961976, + "loss": 0.4298, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "grad_norm": 3.667353630065918, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "learning_rate": 0.0001270391353857295, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "loss": 0.19863076508045197, + "step": 13490 + }, + { + "ce_loss": 0.038019660860300064, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "distill_loss": 0.09817139059305191, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "ref_ce_loss": 0.04153582826256752, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "loss": 0.42932868003845215, + "step": 13490 + }, + { + "ce_loss": 0.10347703844308853, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "distill_loss": 0.15128600597381592, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "ref_ce_loss": 0.10648830235004425, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "loss": 0.805569052696228, + "step": 13490 + }, + { + "ce_loss": 0.06474862992763519, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "distill_loss": 0.15280646085739136, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "ref_ce_loss": 0.13071759045124054, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "loss": 0.31424570083618164, + "step": 13490 + }, + { + "ce_loss": 0.06584881991147995, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "distill_loss": 0.09528250247240067, + "epoch": 4.4996664442961976, + "step": 13490 + }, + { + "epoch": 4.4996664442961976, + "ref_ce_loss": 0.10239201039075851, + "step": 13490 + }, + { + "epoch": 4.503002001334223, + "loss": 0.4227, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "grad_norm": 3.658917188644409, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "learning_rate": 0.00012683898214986824, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "loss": 0.3130754828453064, + "step": 13500 + }, + { + "ce_loss": 0.07128926366567612, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "distill_loss": 0.08057375252246857, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "ref_ce_loss": 0.06667958945035934, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "loss": 0.42011693120002747, + "step": 13500 + }, + { + "ce_loss": 0.10284280776977539, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "distill_loss": 0.215025395154953, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "ref_ce_loss": 0.10161525756120682, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "loss": 0.3984660506248474, + "step": 13500 + }, + { + "ce_loss": 0.06674403697252274, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "distill_loss": 0.2244340181350708, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "ref_ce_loss": 0.08058111369609833, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "loss": 0.4775744080543518, + "step": 13500 + }, + { + "ce_loss": 0.15970994532108307, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "distill_loss": 0.12742125988006592, + "epoch": 4.503002001334223, + "step": 13500 + }, + { + "epoch": 4.503002001334223, + "ref_ce_loss": 0.11462785303592682, + "step": 13500 + }, + { + "epoch": 4.506337558372248, + "loss": 0.4352, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "grad_norm": 2.5688297748565674, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "learning_rate": 0.00012663887115056723, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "loss": 0.34181031584739685, + "step": 13510 + }, + { + "ce_loss": 0.10207465291023254, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "distill_loss": 0.1244707852602005, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "ref_ce_loss": 0.06944452971220016, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "loss": 0.5946003794670105, + "step": 13510 + }, + { + "ce_loss": 0.14638981223106384, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "distill_loss": 0.2600780725479126, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "ref_ce_loss": 0.10639924556016922, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "loss": 0.4521411657333374, + "step": 13510 + }, + { + "ce_loss": 0.11947055160999298, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "distill_loss": 0.14773902297019958, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "ref_ce_loss": 0.12681178748607635, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "loss": 0.34964805841445923, + "step": 13510 + }, + { + "ce_loss": 0.13701759278774261, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "distill_loss": 0.14266565442085266, + "epoch": 4.506337558372248, + "step": 13510 + }, + { + "epoch": 4.506337558372248, + "ref_ce_loss": 0.06979183107614517, + "step": 13510 + }, + { + "epoch": 4.509673115410274, + "loss": 0.4043, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "grad_norm": 2.665954828262329, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "learning_rate": 0.00012643880275275005, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "loss": 0.6819088459014893, + "step": 13520 + }, + { + "ce_loss": 0.2538369596004486, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "distill_loss": 0.19348205626010895, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "ref_ce_loss": 0.14676028490066528, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "loss": 0.3843729496002197, + "step": 13520 + }, + { + "ce_loss": 0.09575576335191727, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "distill_loss": 0.16320499777793884, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "ref_ce_loss": 0.12518060207366943, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "loss": 0.3843967318534851, + "step": 13520 + }, + { + "ce_loss": 0.09873102605342865, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "distill_loss": 0.10267771035432816, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "ref_ce_loss": 0.08693916350603104, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "loss": 0.3445023000240326, + "step": 13520 + }, + { + "ce_loss": 0.06388503313064575, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "distill_loss": 0.1321995109319687, + "epoch": 4.509673115410274, + "step": 13520 + }, + { + "epoch": 4.509673115410274, + "ref_ce_loss": 0.10116295516490936, + "step": 13520 + }, + { + "epoch": 4.513008672448299, + "loss": 0.4466, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "grad_norm": 3.3427317142486572, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "learning_rate": 0.0001262387773212625, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "loss": 0.261531263589859, + "step": 13530 + }, + { + "ce_loss": 0.06258751451969147, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "distill_loss": 0.14119817316532135, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "ref_ce_loss": 0.057690706104040146, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "loss": 0.4364040791988373, + "step": 13530 + }, + { + "ce_loss": 0.03287011384963989, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "distill_loss": 0.11824694275856018, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "ref_ce_loss": 0.05607159063220024, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "loss": 0.48048606514930725, + "step": 13530 + }, + { + "ce_loss": 0.05228961631655693, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "distill_loss": 0.20537841320037842, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "ref_ce_loss": 0.08688047528266907, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "loss": 0.37082439661026, + "step": 13530 + }, + { + "ce_loss": 0.07386504113674164, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "distill_loss": 0.09237289428710938, + "epoch": 4.513008672448299, + "step": 13530 + }, + { + "epoch": 4.513008672448299, + "ref_ce_loss": 0.0847601592540741, + "step": 13530 + }, + { + "epoch": 4.516344229486324, + "loss": 0.4137, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "grad_norm": 3.44813871383667, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "learning_rate": 0.00012603879522087215, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "loss": 0.4075620174407959, + "step": 13540 + }, + { + "ce_loss": 0.1053367629647255, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "distill_loss": 0.15739335119724274, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "ref_ce_loss": 0.10203933715820312, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "loss": 0.6476325392723083, + "step": 13540 + }, + { + "ce_loss": 0.09174636751413345, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "distill_loss": 0.130633145570755, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "ref_ce_loss": 0.13086505234241486, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "loss": 0.44802364706993103, + "step": 13540 + }, + { + "ce_loss": 0.1039741113781929, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "distill_loss": 0.13582943379878998, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "ref_ce_loss": 0.06499588489532471, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "loss": 0.3249742090702057, + "step": 13540 + }, + { + "ce_loss": 0.11507577449083328, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "distill_loss": 0.11589515954256058, + "epoch": 4.516344229486324, + "step": 13540 + }, + { + "epoch": 4.516344229486324, + "ref_ce_loss": 0.09377795457839966, + "step": 13540 + }, + { + "epoch": 4.51967978652435, + "loss": 0.4084, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "grad_norm": 2.4113292694091797, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "learning_rate": 0.0001258388568162673, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "loss": 0.46115991473197937, + "step": 13550 + }, + { + "ce_loss": 0.08020874857902527, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "distill_loss": 0.12565740942955017, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "ref_ce_loss": 0.11783842742443085, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "loss": 0.3117343783378601, + "step": 13550 + }, + { + "ce_loss": 0.10067509114742279, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "distill_loss": 0.12151607871055603, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "ref_ce_loss": 0.054096437990665436, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "loss": 0.41671937704086304, + "step": 13550 + }, + { + "ce_loss": 0.12779854238033295, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "distill_loss": 0.14122280478477478, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "ref_ce_loss": 0.09169165790081024, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "loss": 0.4090811312198639, + "step": 13550 + }, + { + "ce_loss": 0.1314501315355301, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "distill_loss": 0.16391001641750336, + "epoch": 4.51967978652435, + "step": 13550 + }, + { + "epoch": 4.51967978652435, + "ref_ce_loss": 0.08914811909198761, + "step": 13550 + }, + { + "epoch": 4.523015343562375, + "loss": 0.4736, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "grad_norm": 3.5108258724212646, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "learning_rate": 0.00012563896247205685, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "loss": 0.41970306634902954, + "step": 13560 + }, + { + "ce_loss": 0.08534153550863266, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "distill_loss": 0.10883626341819763, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "ref_ce_loss": 0.05522970110177994, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "loss": 0.6811654567718506, + "step": 13560 + }, + { + "ce_loss": 0.2026192545890808, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "distill_loss": 0.2263200879096985, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "ref_ce_loss": 0.10827615857124329, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "loss": 0.43313559889793396, + "step": 13560 + }, + { + "ce_loss": 0.15236859023571014, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "distill_loss": 0.15981322526931763, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "ref_ce_loss": 0.07809644192457199, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "loss": 0.40003854036331177, + "step": 13560 + }, + { + "ce_loss": 0.051413632929325104, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "distill_loss": 0.12792935967445374, + "epoch": 4.523015343562375, + "step": 13560 + }, + { + "epoch": 4.523015343562375, + "ref_ce_loss": 0.06233259662985802, + "step": 13560 + }, + { + "epoch": 4.5263509006004, + "loss": 0.4263, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "grad_norm": 2.6565046310424805, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "learning_rate": 0.00012543911255276927, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "loss": 0.6125295162200928, + "step": 13570 + }, + { + "ce_loss": 0.174873948097229, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "distill_loss": 0.18027006089687347, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "ref_ce_loss": 0.10872701555490494, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "loss": 0.37513914704322815, + "step": 13570 + }, + { + "ce_loss": 0.05586402490735054, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "distill_loss": 0.11506953835487366, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "ref_ce_loss": 0.09433775395154953, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "loss": 0.43863236904144287, + "step": 13570 + }, + { + "ce_loss": 0.06320168077945709, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "distill_loss": 0.18999160826206207, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "ref_ce_loss": 0.09951197355985641, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "loss": 0.3416541814804077, + "step": 13570 + }, + { + "ce_loss": 0.07509807497262955, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "distill_loss": 0.1782594621181488, + "epoch": 4.5263509006004, + "step": 13570 + }, + { + "epoch": 4.5263509006004, + "ref_ce_loss": 0.0882348045706749, + "step": 13570 + }, + { + "epoch": 4.529686457638426, + "loss": 0.4296, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "grad_norm": 1.9833866357803345, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "learning_rate": 0.0001252393074228518, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "loss": 0.3686028718948364, + "step": 13580 + }, + { + "ce_loss": 0.0907389372587204, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "distill_loss": 0.14625293016433716, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "ref_ce_loss": 0.07061794400215149, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "loss": 0.8088022470474243, + "step": 13580 + }, + { + "ce_loss": 0.08733832836151123, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "distill_loss": 0.2592051029205322, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "ref_ce_loss": 0.08921769261360168, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "loss": 0.4520493745803833, + "step": 13580 + }, + { + "ce_loss": 0.09981293976306915, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "distill_loss": 0.22075609862804413, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "ref_ce_loss": 0.06416413187980652, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "loss": 0.33570596575737, + "step": 13580 + }, + { + "ce_loss": 0.0714380294084549, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "distill_loss": 0.15616613626480103, + "epoch": 4.529686457638426, + "step": 13580 + }, + { + "epoch": 4.529686457638426, + "ref_ce_loss": 0.07128936797380447, + "step": 13580 + }, + { + "epoch": 4.533022014676451, + "loss": 0.4352, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "grad_norm": 2.4279701709747314, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "learning_rate": 0.00012503954744667035, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "loss": 0.31245681643486023, + "step": 13590 + }, + { + "ce_loss": 0.07602875679731369, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "distill_loss": 0.1463480144739151, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "ref_ce_loss": 0.06806331127882004, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "loss": 0.3308563828468323, + "step": 13590 + }, + { + "ce_loss": 0.1174980103969574, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "distill_loss": 0.10723531991243362, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "ref_ce_loss": 0.08204618096351624, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "loss": 0.3874821960926056, + "step": 13590 + }, + { + "ce_loss": 0.05842968076467514, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "distill_loss": 0.21752431988716125, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "ref_ce_loss": 0.11144692450761795, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "loss": 0.5839598774909973, + "step": 13590 + }, + { + "ce_loss": 0.12129247188568115, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "distill_loss": 0.3265639841556549, + "epoch": 4.533022014676451, + "step": 13590 + }, + { + "epoch": 4.533022014676451, + "ref_ce_loss": 0.09380409121513367, + "step": 13590 + }, + { + "epoch": 4.536357571714476, + "loss": 0.4077, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "grad_norm": 2.1981921195983887, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "learning_rate": 0.00012483983298850832, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "loss": 0.4827296733856201, + "step": 13600 + }, + { + "ce_loss": 0.11267001181840897, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "distill_loss": 0.1192665547132492, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "ref_ce_loss": 0.09567270427942276, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "loss": 0.3647862374782562, + "step": 13600 + }, + { + "ce_loss": 0.14574190974235535, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "distill_loss": 0.12533891201019287, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "ref_ce_loss": 0.0631135031580925, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "loss": 0.5031948089599609, + "step": 13600 + }, + { + "ce_loss": 0.07701995223760605, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "distill_loss": 0.11162997782230377, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "ref_ce_loss": 0.0998823344707489, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "loss": 0.46570688486099243, + "step": 13600 + }, + { + "ce_loss": 0.16124644875526428, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "distill_loss": 0.2057582139968872, + "epoch": 4.536357571714476, + "step": 13600 + }, + { + "epoch": 4.536357571714476, + "ref_ce_loss": 0.09859221428632736, + "step": 13600 + }, + { + "epoch": 4.539693128752502, + "loss": 0.4824, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "grad_norm": 3.6256258487701416, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "learning_rate": 0.00012464016441256592, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "loss": 0.330820232629776, + "step": 13610 + }, + { + "ce_loss": 0.10562053322792053, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "distill_loss": 0.12647363543510437, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "ref_ce_loss": 0.07756177335977554, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "loss": 0.3412962555885315, + "step": 13610 + }, + { + "ce_loss": 0.09845242649316788, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "distill_loss": 0.1366013139486313, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "ref_ce_loss": 0.07171228528022766, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "loss": 0.6043596863746643, + "step": 13610 + }, + { + "ce_loss": 0.1571023017168045, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "distill_loss": 0.157630056142807, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "ref_ce_loss": 0.11741932481527328, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "loss": 0.3237883150577545, + "step": 13610 + }, + { + "ce_loss": 0.07001443207263947, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "distill_loss": 0.13885082304477692, + "epoch": 4.539693128752502, + "step": 13610 + }, + { + "epoch": 4.539693128752502, + "ref_ce_loss": 0.08538435399532318, + "step": 13610 + }, + { + "epoch": 4.543028685790527, + "loss": 0.4485, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "grad_norm": 3.25905442237854, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "learning_rate": 0.00012444054208296014, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "loss": 0.3815009295940399, + "step": 13620 + }, + { + "ce_loss": 0.11263687163591385, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "distill_loss": 0.13090786337852478, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "ref_ce_loss": 0.07116122543811798, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "loss": 0.39881637692451477, + "step": 13620 + }, + { + "ce_loss": 0.055635806173086166, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "distill_loss": 0.20073339343070984, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "ref_ce_loss": 0.0927748754620552, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "loss": 0.30770015716552734, + "step": 13620 + }, + { + "ce_loss": 0.07468118518590927, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "distill_loss": 0.11699899286031723, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "ref_ce_loss": 0.06440125405788422, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "loss": 0.3126216530799866, + "step": 13620 + }, + { + "ce_loss": 0.07507819682359695, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "distill_loss": 0.09823939204216003, + "epoch": 4.543028685790527, + "step": 13620 + }, + { + "epoch": 4.543028685790527, + "ref_ce_loss": 0.05781750753521919, + "step": 13620 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.3936, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "grad_norm": 1.8271279335021973, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "learning_rate": 0.0001242409663637231, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.3869246542453766, + "step": 13630 + }, + { + "ce_loss": 0.11788243800401688, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "distill_loss": 0.13876157999038696, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "ref_ce_loss": 0.09515149891376495, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.2891252040863037, + "step": 13630 + }, + { + "ce_loss": 0.053451113402843475, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "distill_loss": 0.1296810507774353, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "ref_ce_loss": 0.05890681594610214, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.359307199716568, + "step": 13630 + }, + { + "ce_loss": 0.11626996845006943, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "distill_loss": 0.11499570310115814, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "ref_ce_loss": 0.1020270511507988, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "loss": 0.2649209499359131, + "step": 13630 + }, + { + "ce_loss": 0.05239448696374893, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "distill_loss": 0.12976151704788208, + "epoch": 4.5463642428285524, + "step": 13630 + }, + { + "epoch": 4.5463642428285524, + "ref_ce_loss": 0.05968295782804489, + "step": 13630 + }, + { + "epoch": 4.549699799866578, + "loss": 0.4434, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "grad_norm": 3.0977251529693604, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "learning_rate": 0.0001240414376188023, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "loss": 0.3558688759803772, + "step": 13640 + }, + { + "ce_loss": 0.09094174206256866, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "distill_loss": 0.11046818643808365, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "ref_ce_loss": 0.08839097619056702, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "loss": 0.7989137172698975, + "step": 13640 + }, + { + "ce_loss": 0.1515854001045227, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "distill_loss": 0.16707676649093628, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "ref_ce_loss": 0.08309073746204376, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "loss": 0.37850069999694824, + "step": 13640 + }, + { + "ce_loss": 0.13884668052196503, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "distill_loss": 0.16242651641368866, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "ref_ce_loss": 0.07711191475391388, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "loss": 0.37308332324028015, + "step": 13640 + }, + { + "ce_loss": 0.028073173016309738, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "distill_loss": 0.10945470631122589, + "epoch": 4.549699799866578, + "step": 13640 + }, + { + "epoch": 4.549699799866578, + "ref_ce_loss": 0.08089083433151245, + "step": 13640 + }, + { + "epoch": 4.553035356904603, + "loss": 0.4421, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "grad_norm": 2.5074403285980225, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "learning_rate": 0.0001238419562120596, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "loss": 0.25730282068252563, + "step": 13650 + }, + { + "ce_loss": 0.03727759048342705, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "distill_loss": 0.12253627926111221, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "ref_ce_loss": 0.05770736187696457, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "loss": 0.513620138168335, + "step": 13650 + }, + { + "ce_loss": 0.10222535580396652, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "distill_loss": 0.25187408924102783, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "ref_ce_loss": 0.1170971617102623, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "loss": 0.5014258623123169, + "step": 13650 + }, + { + "ce_loss": 0.2105833888053894, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "distill_loss": 0.16550350189208984, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "ref_ce_loss": 0.09165210276842117, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "loss": 0.567755937576294, + "step": 13650 + }, + { + "ce_loss": 0.1038329005241394, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "distill_loss": 0.31333285570144653, + "epoch": 4.553035356904603, + "step": 13650 + }, + { + "epoch": 4.553035356904603, + "ref_ce_loss": 0.12102946639060974, + "step": 13650 + }, + { + "epoch": 4.5563709139426285, + "loss": 0.4451, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "grad_norm": 3.6462759971618652, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "learning_rate": 0.00012364252250727012, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "loss": 0.6694972515106201, + "step": 13660 + }, + { + "ce_loss": 0.13008111715316772, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "distill_loss": 0.14958806335926056, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "ref_ce_loss": 0.13347876071929932, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "loss": 0.5252541899681091, + "step": 13660 + }, + { + "ce_loss": 0.13631699979305267, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "distill_loss": 0.13244640827178955, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "ref_ce_loss": 0.07643939554691315, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "loss": 0.7412728071212769, + "step": 13660 + }, + { + "ce_loss": 0.0762416198849678, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "distill_loss": 0.18756809830665588, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "ref_ce_loss": 0.0861452966928482, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "loss": 0.30000126361846924, + "step": 13660 + }, + { + "ce_loss": 0.056661494076251984, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "distill_loss": 0.11866292357444763, + "epoch": 4.5563709139426285, + "step": 13660 + }, + { + "epoch": 4.5563709139426285, + "ref_ce_loss": 0.06830247491598129, + "step": 13660 + }, + { + "epoch": 4.559706470980654, + "loss": 0.4842, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "grad_norm": 3.67041277885437, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "learning_rate": 0.00012344313686812248, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "loss": 0.41302490234375, + "step": 13670 + }, + { + "ce_loss": 0.1367223709821701, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "distill_loss": 0.15959084033966064, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "ref_ce_loss": 0.0791315883398056, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "loss": 0.3104380667209625, + "step": 13670 + }, + { + "ce_loss": 0.05096364766359329, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "distill_loss": 0.1786295473575592, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "ref_ce_loss": 0.06254564225673676, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "loss": 0.5430464148521423, + "step": 13670 + }, + { + "ce_loss": 0.06723407655954361, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "distill_loss": 0.13717561960220337, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "ref_ce_loss": 0.079865463078022, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "loss": 0.4139541685581207, + "step": 13670 + }, + { + "ce_loss": 0.10406375676393509, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "distill_loss": 0.1822187751531601, + "epoch": 4.559706470980654, + "step": 13670 + }, + { + "epoch": 4.559706470980654, + "ref_ce_loss": 0.09676679968833923, + "step": 13670 + }, + { + "epoch": 4.563042028018679, + "loss": 0.4609, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "grad_norm": 2.727271795272827, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "learning_rate": 0.00012324379965821734, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "loss": 0.5735003352165222, + "step": 13680 + }, + { + "ce_loss": 0.07929814606904984, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "distill_loss": 0.1739797592163086, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "ref_ce_loss": 0.10339117795228958, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "loss": 0.33152449131011963, + "step": 13680 + }, + { + "ce_loss": 0.1058422103524208, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "distill_loss": 0.11422500759363174, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "ref_ce_loss": 0.08426758646965027, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "loss": 0.5023914575576782, + "step": 13680 + }, + { + "ce_loss": 0.12481412291526794, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "distill_loss": 0.16830413043498993, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "ref_ce_loss": 0.09582826495170593, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "loss": 0.3103086054325104, + "step": 13680 + }, + { + "ce_loss": 0.08641095459461212, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "distill_loss": 0.14794382452964783, + "epoch": 4.563042028018679, + "step": 13680 + }, + { + "epoch": 4.563042028018679, + "ref_ce_loss": 0.07582417875528336, + "step": 13680 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.4606, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "grad_norm": 10.296592712402344, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "learning_rate": 0.00012304451124106716, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.5351616144180298, + "step": 13690 + }, + { + "ce_loss": 0.22443951666355133, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "distill_loss": 0.1858910769224167, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "ref_ce_loss": 0.0964193195104599, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.42722010612487793, + "step": 13690 + }, + { + "ce_loss": 0.15494583547115326, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "distill_loss": 0.14783549308776855, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "ref_ce_loss": 0.08811517059803009, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.4499038755893707, + "step": 13690 + }, + { + "ce_loss": 0.08879373222589493, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "distill_loss": 0.17907992005348206, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "ref_ce_loss": 0.0889173224568367, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "loss": 0.6208651065826416, + "step": 13690 + }, + { + "ce_loss": 0.12071339040994644, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "distill_loss": 0.14878807961940765, + "epoch": 4.5663775850567045, + "step": 13690 + }, + { + "epoch": 4.5663775850567045, + "ref_ce_loss": 0.07070641219615936, + "step": 13690 + }, + { + "epoch": 4.56971314209473, + "loss": 0.4358, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "grad_norm": 3.6545889377593994, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "learning_rate": 0.00012284527198009543, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "loss": 0.43364912271499634, + "step": 13700 + }, + { + "ce_loss": 0.09774000942707062, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "distill_loss": 0.13901299238204956, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "ref_ce_loss": 0.08951815217733383, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "loss": 0.5613880753517151, + "step": 13700 + }, + { + "ce_loss": 0.09675523638725281, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "distill_loss": 0.23557351529598236, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "ref_ce_loss": 0.08532778918743134, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "loss": 0.245731383562088, + "step": 13700 + }, + { + "ce_loss": 0.017872940748929977, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "distill_loss": 0.08782623708248138, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "ref_ce_loss": 0.056050386279821396, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "loss": 0.30002671480178833, + "step": 13700 + }, + { + "ce_loss": 0.06583509594202042, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "distill_loss": 0.1712363362312317, + "epoch": 4.56971314209473, + "step": 13700 + }, + { + "epoch": 4.56971314209473, + "ref_ce_loss": 0.06252934783697128, + "step": 13700 + }, + { + "epoch": 4.573048699132755, + "loss": 0.4209, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "grad_norm": 2.177154064178467, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "learning_rate": 0.00012264608223863592, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "loss": 0.4023416042327881, + "step": 13710 + }, + { + "ce_loss": 0.140888050198555, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "distill_loss": 0.12739530205726624, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "ref_ce_loss": 0.1103469580411911, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "loss": 0.5931873917579651, + "step": 13710 + }, + { + "ce_loss": 0.10426094383001328, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "distill_loss": 0.2250000536441803, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "ref_ce_loss": 0.12573935091495514, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "loss": 0.35033118724823, + "step": 13710 + }, + { + "ce_loss": 0.07681325823068619, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "distill_loss": 0.19096848368644714, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "ref_ce_loss": 0.0824156329035759, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "loss": 0.47160404920578003, + "step": 13710 + }, + { + "ce_loss": 0.13833743333816528, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "distill_loss": 0.219387024641037, + "epoch": 4.573048699132755, + "step": 13710 + }, + { + "epoch": 4.573048699132755, + "ref_ce_loss": 0.08718808740377426, + "step": 13710 + }, + { + "epoch": 4.576384256170781, + "loss": 0.4349, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "grad_norm": 2.8919193744659424, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "learning_rate": 0.00012244694237993216, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "loss": 0.39218464493751526, + "step": 13720 + }, + { + "ce_loss": 0.14482086896896362, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "distill_loss": 0.15278075635433197, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "ref_ce_loss": 0.05385451018810272, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "loss": 0.8182839155197144, + "step": 13720 + }, + { + "ce_loss": 0.10439430177211761, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "distill_loss": 0.16038022935390472, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "ref_ce_loss": 0.09227900952100754, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "loss": 0.6853447556495667, + "step": 13720 + }, + { + "ce_loss": 0.07551288604736328, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "distill_loss": 0.15994273126125336, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "ref_ce_loss": 0.1423223465681076, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "loss": 0.303835928440094, + "step": 13720 + }, + { + "ce_loss": 0.062204133719205856, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "distill_loss": 0.10961860418319702, + "epoch": 4.576384256170781, + "step": 13720 + }, + { + "epoch": 4.576384256170781, + "ref_ce_loss": 0.07964453846216202, + "step": 13720 + }, + { + "epoch": 4.579719813208806, + "loss": 0.3986, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "grad_norm": 1.9763175249099731, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "learning_rate": 0.00012224785276713674, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "loss": 0.35807281732559204, + "step": 13730 + }, + { + "ce_loss": 0.08664276450872421, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "distill_loss": 0.14837504923343658, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "ref_ce_loss": 0.07494465261697769, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "loss": 0.2507583200931549, + "step": 13730 + }, + { + "ce_loss": 0.05459444224834442, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "distill_loss": 0.10137300193309784, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "ref_ce_loss": 0.09378074109554291, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "loss": 0.4497610330581665, + "step": 13730 + }, + { + "ce_loss": 0.12088881433010101, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "distill_loss": 0.18141990900039673, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "ref_ce_loss": 0.11048609763383865, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "loss": 0.46002933382987976, + "step": 13730 + }, + { + "ce_loss": 0.15615512430667877, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "distill_loss": 0.14017333090305328, + "epoch": 4.579719813208806, + "step": 13730 + }, + { + "epoch": 4.579719813208806, + "ref_ce_loss": 0.08091697841882706, + "step": 13730 + }, + { + "epoch": 4.583055370246831, + "loss": 0.4443, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "grad_norm": 1.916420340538025, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "learning_rate": 0.00012204881376331049, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "loss": 0.36558467149734497, + "step": 13740 + }, + { + "ce_loss": 0.07916125655174255, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "distill_loss": 0.16772039234638214, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "ref_ce_loss": 0.06706126779317856, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "loss": 0.4626239538192749, + "step": 13740 + }, + { + "ce_loss": 0.09908965229988098, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "distill_loss": 0.14869137108325958, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "ref_ce_loss": 0.14961369335651398, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "loss": 0.6350833773612976, + "step": 13740 + }, + { + "ce_loss": 0.09164462983608246, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "distill_loss": 0.13121801614761353, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "ref_ce_loss": 0.12443569302558899, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "loss": 0.25773707032203674, + "step": 13740 + }, + { + "ce_loss": 0.0695008710026741, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "distill_loss": 0.11608363687992096, + "epoch": 4.583055370246831, + "step": 13740 + }, + { + "epoch": 4.583055370246831, + "ref_ce_loss": 0.04488598555326462, + "step": 13740 + }, + { + "epoch": 4.586390927284857, + "loss": 0.475, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "grad_norm": 3.7855210304260254, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "learning_rate": 0.00012184982573142215, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "loss": 0.4133288264274597, + "step": 13750 + }, + { + "ce_loss": 0.09249895066022873, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "distill_loss": 0.1350885033607483, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "ref_ce_loss": 0.09329091012477875, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "loss": 0.2725394070148468, + "step": 13750 + }, + { + "ce_loss": 0.057737085968256, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "distill_loss": 0.12318053841590881, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "ref_ce_loss": 0.06361659616231918, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "loss": 0.2811312675476074, + "step": 13750 + }, + { + "ce_loss": 0.04427865892648697, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "distill_loss": 0.13684555888175964, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "ref_ce_loss": 0.05077182129025459, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "loss": 0.6474969387054443, + "step": 13750 + }, + { + "ce_loss": 0.08332732319831848, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "distill_loss": 0.13889549672603607, + "epoch": 4.586390927284857, + "step": 13750 + }, + { + "epoch": 4.586390927284857, + "ref_ce_loss": 0.06022493168711662, + "step": 13750 + }, + { + "epoch": 4.589726484322882, + "loss": 0.4507, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "grad_norm": 2.70395565032959, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "learning_rate": 0.00012165088903434731, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "loss": 0.19448912143707275, + "step": 13760 + }, + { + "ce_loss": 0.011985452845692635, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "distill_loss": 0.0980040431022644, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "ref_ce_loss": 0.08422990143299103, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "loss": 0.511165201663971, + "step": 13760 + }, + { + "ce_loss": 0.17377831041812897, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "distill_loss": 0.1677647829055786, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "ref_ce_loss": 0.10331190377473831, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "loss": 0.5306928157806396, + "step": 13760 + }, + { + "ce_loss": 0.17135898768901825, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "distill_loss": 0.20471972227096558, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "ref_ce_loss": 0.15429776906967163, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "loss": 0.5309099555015564, + "step": 13760 + }, + { + "ce_loss": 0.17451880872249603, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "distill_loss": 0.2045009434223175, + "epoch": 4.589726484322882, + "step": 13760 + }, + { + "epoch": 4.589726484322882, + "ref_ce_loss": 0.11921660602092743, + "step": 13760 + }, + { + "epoch": 4.593062041360907, + "loss": 0.4912, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "grad_norm": 3.7819948196411133, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "learning_rate": 0.00012145200403486805, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "loss": 0.6235246062278748, + "step": 13770 + }, + { + "ce_loss": 0.12689945101737976, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "distill_loss": 0.17795231938362122, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "ref_ce_loss": 0.10836771130561829, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "loss": 0.2900228798389435, + "step": 13770 + }, + { + "ce_loss": 0.053863294422626495, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "distill_loss": 0.13948102295398712, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "ref_ce_loss": 0.09636963158845901, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "loss": 0.5091549158096313, + "step": 13770 + }, + { + "ce_loss": 0.08481498062610626, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "distill_loss": 0.30344074964523315, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "ref_ce_loss": 0.12071350961923599, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "loss": 0.5841506719589233, + "step": 13770 + }, + { + "ce_loss": 0.1180991530418396, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "distill_loss": 0.21680428087711334, + "epoch": 4.593062041360907, + "step": 13770 + }, + { + "epoch": 4.593062041360907, + "ref_ce_loss": 0.11102721095085144, + "step": 13770 + }, + { + "epoch": 4.596397598398933, + "loss": 0.48, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "grad_norm": 2.3747878074645996, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "learning_rate": 0.00012125317109567219, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "loss": 0.7505519390106201, + "step": 13780 + }, + { + "ce_loss": 0.1606123447418213, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "distill_loss": 0.16520407795906067, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "ref_ce_loss": 0.08397366106510162, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "loss": 0.3478164076805115, + "step": 13780 + }, + { + "ce_loss": 0.07152732461690903, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "distill_loss": 0.1993105709552765, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "ref_ce_loss": 0.07670750468969345, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "loss": 0.5030295252799988, + "step": 13780 + }, + { + "ce_loss": 0.06702382862567902, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "distill_loss": 0.10921978205442429, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "ref_ce_loss": 0.10053457319736481, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "loss": 0.446050763130188, + "step": 13780 + }, + { + "ce_loss": 0.1185617446899414, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "distill_loss": 0.1715582013130188, + "epoch": 4.596397598398933, + "step": 13780 + }, + { + "epoch": 4.596397598398933, + "ref_ce_loss": 0.08671645820140839, + "step": 13780 + }, + { + "epoch": 4.599733155436958, + "loss": 0.4124, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "grad_norm": 3.6794545650482178, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "learning_rate": 0.00012105439057935254, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "loss": 0.3869558274745941, + "step": 13790 + }, + { + "ce_loss": 0.09533415734767914, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "distill_loss": 0.15595683455467224, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "ref_ce_loss": 0.10946252197027206, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "loss": 0.4473382830619812, + "step": 13790 + }, + { + "ce_loss": 0.11866164207458496, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "distill_loss": 0.21954916417598724, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "ref_ce_loss": 0.10898900777101517, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "loss": 0.23668749630451202, + "step": 13790 + }, + { + "ce_loss": 0.0502098873257637, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "distill_loss": 0.10353440046310425, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "ref_ce_loss": 0.047333549708127975, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "loss": 0.5408918857574463, + "step": 13790 + }, + { + "ce_loss": 0.12211053818464279, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "distill_loss": 0.15445077419281006, + "epoch": 4.599733155436958, + "step": 13790 + }, + { + "epoch": 4.599733155436958, + "ref_ce_loss": 0.08605307340621948, + "step": 13790 + }, + { + "epoch": 4.603068712474983, + "loss": 0.4387, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "grad_norm": 3.229055404663086, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "learning_rate": 0.00012085566284840637, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "loss": 0.2670734226703644, + "step": 13800 + }, + { + "ce_loss": 0.010470490902662277, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "distill_loss": 0.0766136422753334, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "ref_ce_loss": 0.05777127668261528, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "loss": 0.40359926223754883, + "step": 13800 + }, + { + "ce_loss": 0.19126923382282257, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "distill_loss": 0.1276741325855255, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "ref_ce_loss": 0.08455155044794083, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "loss": 0.7321145534515381, + "step": 13800 + }, + { + "ce_loss": 0.13351599872112274, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "distill_loss": 0.18421530723571777, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "ref_ce_loss": 0.09967406839132309, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "loss": 0.3720538318157196, + "step": 13800 + }, + { + "ce_loss": 0.11610039323568344, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "distill_loss": 0.14120493829250336, + "epoch": 4.603068712474983, + "step": 13800 + }, + { + "epoch": 4.603068712474983, + "ref_ce_loss": 0.11461121588945389, + "step": 13800 + }, + { + "epoch": 4.606404269513009, + "loss": 0.4275, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "grad_norm": 3.0860586166381836, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "learning_rate": 0.00012065698826523464, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "loss": 0.5441614985466003, + "step": 13810 + }, + { + "ce_loss": 0.14497758448123932, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "distill_loss": 0.14539915323257446, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "ref_ce_loss": 0.0962267741560936, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "loss": 0.2371405065059662, + "step": 13810 + }, + { + "ce_loss": 0.05888356268405914, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "distill_loss": 0.10163218528032303, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "ref_ce_loss": 0.0764048844575882, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "loss": 0.35190001130104065, + "step": 13810 + }, + { + "ce_loss": 0.1194930449128151, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "distill_loss": 0.11340652406215668, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "ref_ce_loss": 0.08773592859506607, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "loss": 0.3663141429424286, + "step": 13810 + }, + { + "ce_loss": 0.11494455486536026, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "distill_loss": 0.1567523181438446, + "epoch": 4.606404269513009, + "step": 13810 + }, + { + "epoch": 4.606404269513009, + "ref_ce_loss": 0.07022814452648163, + "step": 13810 + }, + { + "epoch": 4.609739826551034, + "loss": 0.4339, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "grad_norm": 4.00881290435791, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "learning_rate": 0.00012045836719214144, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "loss": 0.4647110402584076, + "step": 13820 + }, + { + "ce_loss": 0.12648433446884155, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "distill_loss": 0.19175873696804047, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "ref_ce_loss": 0.10510455071926117, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "loss": 0.31502780318260193, + "step": 13820 + }, + { + "ce_loss": 0.06370996683835983, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "distill_loss": 0.11913012713193893, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "ref_ce_loss": 0.09854836761951447, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "loss": 0.2350054383277893, + "step": 13820 + }, + { + "ce_loss": 0.0437438003718853, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "distill_loss": 0.13966906070709229, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "ref_ce_loss": 0.05152058228850365, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "loss": 0.5847965478897095, + "step": 13820 + }, + { + "ce_loss": 0.059979137033224106, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "distill_loss": 0.1415148377418518, + "epoch": 4.609739826551034, + "step": 13820 + }, + { + "epoch": 4.609739826551034, + "ref_ce_loss": 0.07900790870189667, + "step": 13820 + }, + { + "epoch": 4.613075383589059, + "loss": 0.4223, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "grad_norm": 2.820600986480713, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "learning_rate": 0.00012025979999133331, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "loss": 0.4595937728881836, + "step": 13830 + }, + { + "ce_loss": 0.1749647557735443, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "distill_loss": 0.13943815231323242, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "ref_ce_loss": 0.12155468761920929, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "loss": 0.6600459218025208, + "step": 13830 + }, + { + "ce_loss": 0.06968953460454941, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "distill_loss": 0.14609262347221375, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "ref_ce_loss": 0.08199404180049896, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "loss": 0.33543717861175537, + "step": 13830 + }, + { + "ce_loss": 0.05783466994762421, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "distill_loss": 0.11310224235057831, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "ref_ce_loss": 0.0892929807305336, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "loss": 0.3647262454032898, + "step": 13830 + }, + { + "ce_loss": 0.04082637280225754, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "distill_loss": 0.15222899615764618, + "epoch": 4.613075383589059, + "step": 13830 + }, + { + "epoch": 4.613075383589059, + "ref_ce_loss": 0.08344709873199463, + "step": 13830 + }, + { + "epoch": 4.616410940627085, + "loss": 0.4776, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "grad_norm": 5.2468366622924805, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "learning_rate": 0.00012006128702491837, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "loss": 0.18527640402317047, + "step": 13840 + }, + { + "ce_loss": 0.016801243647933006, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "distill_loss": 0.09196959435939789, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "ref_ce_loss": 0.07647021859884262, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "loss": 0.28422296047210693, + "step": 13840 + }, + { + "ce_loss": 0.0651751160621643, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "distill_loss": 0.09933413565158844, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "ref_ce_loss": 0.08113130927085876, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "loss": 0.5058305263519287, + "step": 13840 + }, + { + "ce_loss": 0.08947150409221649, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "distill_loss": 0.20171362161636353, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "ref_ce_loss": 0.10488651692867279, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "loss": 0.33015942573547363, + "step": 13840 + }, + { + "ce_loss": 0.04778118059039116, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "distill_loss": 0.09841583669185638, + "epoch": 4.616410940627085, + "step": 13840 + }, + { + "epoch": 4.616410940627085, + "ref_ce_loss": 0.08348551392555237, + "step": 13840 + }, + { + "epoch": 4.61974649766511, + "loss": 0.4282, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "grad_norm": 3.1201624870300293, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "learning_rate": 0.00011986282865490614, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "loss": 0.42631927132606506, + "step": 13850 + }, + { + "ce_loss": 0.12742312252521515, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "distill_loss": 0.159180149435997, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "ref_ce_loss": 0.10846816748380661, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "loss": 0.24269895255565643, + "step": 13850 + }, + { + "ce_loss": 0.022357333451509476, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "distill_loss": 0.10337289422750473, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "ref_ce_loss": 0.09756804257631302, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "loss": 0.27713820338249207, + "step": 13850 + }, + { + "ce_loss": 0.05909983813762665, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "distill_loss": 0.1180325597524643, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "ref_ce_loss": 0.0755968987941742, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "loss": 0.4052237868309021, + "step": 13850 + }, + { + "ce_loss": 0.05517116189002991, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "distill_loss": 0.13782203197479248, + "epoch": 4.61974649766511, + "step": 13850 + }, + { + "epoch": 4.61974649766511, + "ref_ce_loss": 0.08097584545612335, + "step": 13850 + }, + { + "epoch": 4.6230820547031355, + "loss": 0.421, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "grad_norm": 2.583071708679199, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "learning_rate": 0.00011966442524320619, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "loss": 0.36535245180130005, + "step": 13860 + }, + { + "ce_loss": 0.08213835209608078, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "distill_loss": 0.12975214421749115, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "ref_ce_loss": 0.039594732224941254, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "loss": 0.530299723148346, + "step": 13860 + }, + { + "ce_loss": 0.12899358570575714, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "distill_loss": 0.2598971128463745, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "ref_ce_loss": 0.09959632903337479, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "loss": 0.3098726272583008, + "step": 13860 + }, + { + "ce_loss": 0.07682958990335464, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "distill_loss": 0.16426794230937958, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "ref_ce_loss": 0.06860676407814026, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "loss": 0.2955167293548584, + "step": 13860 + }, + { + "ce_loss": 0.06630170345306396, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "distill_loss": 0.13955476880073547, + "epoch": 4.6230820547031355, + "step": 13860 + }, + { + "epoch": 4.6230820547031355, + "ref_ce_loss": 0.08948300033807755, + "step": 13860 + }, + { + "epoch": 4.626417611741161, + "loss": 0.3961, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "grad_norm": 2.267632007598877, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "learning_rate": 0.00011946607715162821, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "loss": 0.5063121318817139, + "step": 13870 + }, + { + "ce_loss": 0.07224904745817184, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "distill_loss": 0.21819202601909637, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "ref_ce_loss": 0.046448878943920135, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "loss": 0.33341118693351746, + "step": 13870 + }, + { + "ce_loss": 0.0745762288570404, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "distill_loss": 0.11459389328956604, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "ref_ce_loss": 0.09432224929332733, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "loss": 0.2235090583562851, + "step": 13870 + }, + { + "ce_loss": 0.04365307465195656, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "distill_loss": 0.11507236212491989, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "ref_ce_loss": 0.0628926232457161, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "loss": 0.17913806438446045, + "step": 13870 + }, + { + "ce_loss": 0.022245345637202263, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "distill_loss": 0.0938110202550888, + "epoch": 4.626417611741161, + "step": 13870 + }, + { + "epoch": 4.626417611741161, + "ref_ce_loss": 0.062074512243270874, + "step": 13870 + }, + { + "epoch": 4.629753168779186, + "loss": 0.4243, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "grad_norm": 1.9865474700927734, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "learning_rate": 0.00011926778474188093, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "loss": 0.5977710485458374, + "step": 13880 + }, + { + "ce_loss": 0.1861013025045395, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "distill_loss": 0.19886083900928497, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "ref_ce_loss": 0.12160538136959076, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "loss": 0.31760358810424805, + "step": 13880 + }, + { + "ce_loss": 0.09331942349672318, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "distill_loss": 0.10661468654870987, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "ref_ce_loss": 0.09620601683855057, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "loss": 0.29638510942459106, + "step": 13880 + }, + { + "ce_loss": 0.01876196824014187, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "distill_loss": 0.13782073557376862, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "ref_ce_loss": 0.08540710061788559, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "loss": 0.3348587453365326, + "step": 13880 + }, + { + "ce_loss": 0.12150032818317413, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "distill_loss": 0.1310514360666275, + "epoch": 4.629753168779186, + "step": 13880 + }, + { + "epoch": 4.629753168779186, + "ref_ce_loss": 0.05937032774090767, + "step": 13880 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.4186, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "grad_norm": 2.33272385597229, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "learning_rate": 0.00011906954837557133, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.4339160919189453, + "step": 13890 + }, + { + "ce_loss": 0.10280374437570572, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "distill_loss": 0.15689747035503387, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "ref_ce_loss": 0.08752254396677017, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.2364230751991272, + "step": 13890 + }, + { + "ce_loss": 0.05747167766094208, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "distill_loss": 0.10423195362091064, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "ref_ce_loss": 0.0538666807115078, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.2561870515346527, + "step": 13890 + }, + { + "ce_loss": 0.06521957367658615, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "distill_loss": 0.10261288285255432, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "ref_ce_loss": 0.06124391034245491, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "loss": 0.266074538230896, + "step": 13890 + }, + { + "ce_loss": 0.06419584900140762, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "distill_loss": 0.09646572917699814, + "epoch": 4.6330887258172115, + "step": 13890 + }, + { + "epoch": 4.6330887258172115, + "ref_ce_loss": 0.0818057730793953, + "step": 13890 + }, + { + "epoch": 4.636424282855237, + "loss": 0.4187, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "grad_norm": 3.4036307334899902, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "learning_rate": 0.00011887136841420444, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "loss": 0.7854809761047363, + "step": 13900 + }, + { + "ce_loss": 0.1579110324382782, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "distill_loss": 0.17435112595558167, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "ref_ce_loss": 0.07334227114915848, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "loss": 0.7152835130691528, + "step": 13900 + }, + { + "ce_loss": 0.1721900850534439, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "distill_loss": 0.20881494879722595, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "ref_ce_loss": 0.11643655598163605, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "loss": 0.3511710464954376, + "step": 13900 + }, + { + "ce_loss": 0.09932536631822586, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "distill_loss": 0.13884766399860382, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "ref_ce_loss": 0.08952175080776215, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "loss": 0.6365158557891846, + "step": 13900 + }, + { + "ce_loss": 0.19059507548809052, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "distill_loss": 0.17020054161548615, + "epoch": 4.636424282855237, + "step": 13900 + }, + { + "epoch": 4.636424282855237, + "ref_ce_loss": 0.09225431084632874, + "step": 13900 + }, + { + "epoch": 4.639759839893262, + "loss": 0.4603, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "grad_norm": 2.318143367767334, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "learning_rate": 0.00011867324521918238, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "loss": 0.3799995183944702, + "step": 13910 + }, + { + "ce_loss": 0.13443692028522491, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "distill_loss": 0.17886750400066376, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "ref_ce_loss": 0.06636130809783936, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "loss": 0.4490954875946045, + "step": 13910 + }, + { + "ce_loss": 0.09709697216749191, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "distill_loss": 0.175789013504982, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "ref_ce_loss": 0.11594989150762558, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "loss": 0.7186691761016846, + "step": 13910 + }, + { + "ce_loss": 0.11297119408845901, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "distill_loss": 0.15328797698020935, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "ref_ce_loss": 0.07922860234975815, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "loss": 0.4605812728404999, + "step": 13910 + }, + { + "ce_loss": 0.12701410055160522, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "distill_loss": 0.16070149838924408, + "epoch": 4.639759839893262, + "step": 13910 + }, + { + "epoch": 4.639759839893262, + "ref_ce_loss": 0.0495823509991169, + "step": 13910 + }, + { + "epoch": 4.643095396931288, + "loss": 0.4562, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "grad_norm": 2.51172137260437, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "learning_rate": 0.00011847517915180356, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "loss": 0.41623979806900024, + "step": 13920 + }, + { + "ce_loss": 0.05802667886018753, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "distill_loss": 0.14537282288074493, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "ref_ce_loss": 0.063286691904068, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "loss": 0.8246922492980957, + "step": 13920 + }, + { + "ce_loss": 0.06929119676351547, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "distill_loss": 0.16589587926864624, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "ref_ce_loss": 0.10522410273551941, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "loss": 0.5103774666786194, + "step": 13920 + }, + { + "ce_loss": 0.1705876737833023, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "distill_loss": 0.15484222769737244, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "ref_ce_loss": 0.14860162138938904, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "loss": 0.46628522872924805, + "step": 13920 + }, + { + "ce_loss": 0.0842401385307312, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "distill_loss": 0.149397075176239, + "epoch": 4.643095396931288, + "step": 13920 + }, + { + "epoch": 4.643095396931288, + "ref_ce_loss": 0.0805220827460289, + "step": 13920 + }, + { + "epoch": 4.646430953969313, + "loss": 0.4727, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "grad_norm": 2.2505548000335693, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "learning_rate": 0.00011827717057326252, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "loss": 0.36407792568206787, + "step": 13930 + }, + { + "ce_loss": 0.11833661794662476, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "distill_loss": 0.15202414989471436, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "ref_ce_loss": 0.0678495541214943, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "loss": 0.3908737003803253, + "step": 13930 + }, + { + "ce_loss": 0.10677006840705872, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "distill_loss": 0.16376477479934692, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "ref_ce_loss": 0.11999862641096115, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "loss": 0.41768503189086914, + "step": 13930 + }, + { + "ce_loss": 0.09405351430177689, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "distill_loss": 0.16130514442920685, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "ref_ce_loss": 0.10390011221170425, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "loss": 0.39294177293777466, + "step": 13930 + }, + { + "ce_loss": 0.11008800566196442, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "distill_loss": 0.15608064830303192, + "epoch": 4.646430953969313, + "step": 13930 + }, + { + "epoch": 4.646430953969313, + "ref_ce_loss": 0.059509534388780594, + "step": 13930 + }, + { + "epoch": 4.649766511007338, + "loss": 0.3981, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "grad_norm": 3.809084177017212, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "learning_rate": 0.00011807921984464869, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "loss": 0.322648823261261, + "step": 13940 + }, + { + "ce_loss": 0.048746369779109955, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "distill_loss": 0.12906503677368164, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "ref_ce_loss": 0.055475905537605286, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "loss": 0.3334718942642212, + "step": 13940 + }, + { + "ce_loss": 0.07918932288885117, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "distill_loss": 0.14949554204940796, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "ref_ce_loss": 0.03752530738711357, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "loss": 0.37703388929367065, + "step": 13940 + }, + { + "ce_loss": 0.08964626491069794, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "distill_loss": 0.15478254854679108, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "ref_ce_loss": 0.09836571663618088, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "loss": 0.6053097248077393, + "step": 13940 + }, + { + "ce_loss": 0.09944993257522583, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "distill_loss": 0.18654851615428925, + "epoch": 4.649766511007338, + "step": 13940 + }, + { + "epoch": 4.649766511007338, + "ref_ce_loss": 0.11349543184041977, + "step": 13940 + }, + { + "epoch": 4.653102068045364, + "loss": 0.4453, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "grad_norm": 1.9943000078201294, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "learning_rate": 0.00011788132732694608, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "loss": 0.6359728574752808, + "step": 13950 + }, + { + "ce_loss": 0.07161174714565277, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "distill_loss": 0.13512277603149414, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "ref_ce_loss": 0.08939622342586517, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "loss": 0.3288179039955139, + "step": 13950 + }, + { + "ce_loss": 0.08766458183526993, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "distill_loss": 0.13691724836826324, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "ref_ce_loss": 0.0851324051618576, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "loss": 0.6143372058868408, + "step": 13950 + }, + { + "ce_loss": 0.15045179426670074, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "distill_loss": 0.1924908459186554, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "ref_ce_loss": 0.08594503998756409, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "loss": 0.533692479133606, + "step": 13950 + }, + { + "ce_loss": 0.11740151047706604, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "distill_loss": 0.14584587514400482, + "epoch": 4.653102068045364, + "step": 13950 + }, + { + "epoch": 4.653102068045364, + "ref_ce_loss": 0.06598098576068878, + "step": 13950 + }, + { + "epoch": 4.656437625083389, + "loss": 0.4665, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "grad_norm": 4.326551914215088, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "learning_rate": 0.00011768349338103273, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "loss": 0.38727375864982605, + "step": 13960 + }, + { + "ce_loss": 0.05691125616431236, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "distill_loss": 0.12060718238353729, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "ref_ce_loss": 0.0724659189581871, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "loss": 0.4936906397342682, + "step": 13960 + }, + { + "ce_loss": 0.12200823426246643, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "distill_loss": 0.1445818841457367, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "ref_ce_loss": 0.09555574506521225, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "loss": 0.5360593795776367, + "step": 13960 + }, + { + "ce_loss": 0.12914180755615234, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "distill_loss": 0.19759999215602875, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "ref_ce_loss": 0.1032886877655983, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "loss": 0.4051770865917206, + "step": 13960 + }, + { + "ce_loss": 0.1243601068854332, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "distill_loss": 0.13619369268417358, + "epoch": 4.656437625083389, + "step": 13960 + }, + { + "epoch": 4.656437625083389, + "ref_ce_loss": 0.10029909014701843, + "step": 13960 + }, + { + "epoch": 4.659773182121414, + "loss": 0.4264, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "grad_norm": 2.531710624694824, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "learning_rate": 0.0001174857183676796, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "loss": 0.3981442451477051, + "step": 13970 + }, + { + "ce_loss": 0.12004105746746063, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "distill_loss": 0.1404426544904709, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "ref_ce_loss": 0.07255041599273682, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "loss": 0.3417322635650635, + "step": 13970 + }, + { + "ce_loss": 0.12429323047399521, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "distill_loss": 0.13273431360721588, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "ref_ce_loss": 0.08454957604408264, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "loss": 0.3531622886657715, + "step": 13970 + }, + { + "ce_loss": 0.0692068412899971, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "distill_loss": 0.1651550829410553, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "ref_ce_loss": 0.11834469437599182, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "loss": 0.382656991481781, + "step": 13970 + }, + { + "ce_loss": 0.028870120644569397, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "distill_loss": 0.10546714067459106, + "epoch": 4.659773182121414, + "step": 13970 + }, + { + "epoch": 4.659773182121414, + "ref_ce_loss": 0.06734929233789444, + "step": 13970 + }, + { + "epoch": 4.66310873915944, + "loss": 0.4685, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "grad_norm": 4.019238471984863, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "learning_rate": 0.00011728800264755034, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "loss": 0.2976240813732147, + "step": 13980 + }, + { + "ce_loss": 0.06153253838419914, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "distill_loss": 0.131220281124115, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "ref_ce_loss": 0.06489714235067368, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "loss": 0.38370513916015625, + "step": 13980 + }, + { + "ce_loss": 0.10629656910896301, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "distill_loss": 0.14813432097434998, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "ref_ce_loss": 0.10886724293231964, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "loss": 0.29492947459220886, + "step": 13980 + }, + { + "ce_loss": 0.05098975449800491, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "distill_loss": 0.1699628233909607, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "ref_ce_loss": 0.07380944490432739, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "loss": 0.3338431417942047, + "step": 13980 + }, + { + "ce_loss": 0.08042466640472412, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "distill_loss": 0.11967265605926514, + "epoch": 4.66310873915944, + "step": 13980 + }, + { + "epoch": 4.66310873915944, + "ref_ce_loss": 0.06992650032043457, + "step": 13980 + }, + { + "epoch": 4.666444296197465, + "loss": 0.4189, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "grad_norm": 2.2785110473632812, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "learning_rate": 0.00011709034658120039, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "loss": 0.27386167645454407, + "step": 13990 + }, + { + "ce_loss": 0.0423194095492363, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "distill_loss": 0.12726286053657532, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "ref_ce_loss": 0.07899350672960281, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "loss": 0.506066083908081, + "step": 13990 + }, + { + "ce_loss": 0.09570929408073425, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "distill_loss": 0.14508602023124695, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "ref_ce_loss": 0.10210248827934265, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "loss": 0.3897585868835449, + "step": 13990 + }, + { + "ce_loss": 0.09135571867227554, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "distill_loss": 0.24521411955356598, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "ref_ce_loss": 0.053022027015686035, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "loss": 0.4636989235877991, + "step": 13990 + }, + { + "ce_loss": 0.10903631150722504, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "distill_loss": 0.183272123336792, + "epoch": 4.666444296197465, + "step": 13990 + }, + { + "epoch": 4.666444296197465, + "ref_ce_loss": 0.11822542548179626, + "step": 13990 + }, + { + "epoch": 4.66977985323549, + "loss": 0.4258, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "grad_norm": 2.021631956100464, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "learning_rate": 0.00011689275052907649, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "loss": 0.4199639856815338, + "step": 14000 + }, + { + "ce_loss": 0.09831275790929794, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "distill_loss": 0.17195957899093628, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "ref_ce_loss": 0.06019444018602371, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "loss": 0.34083402156829834, + "step": 14000 + }, + { + "ce_loss": 0.13644544780254364, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "distill_loss": 0.1311686784029007, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "ref_ce_loss": 0.0720614343881607, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "loss": 0.48071956634521484, + "step": 14000 + }, + { + "ce_loss": 0.1457836627960205, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "distill_loss": 0.20094430446624756, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "ref_ce_loss": 0.07603190094232559, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "loss": 0.34649357199668884, + "step": 14000 + }, + { + "ce_loss": 0.04942803084850311, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "distill_loss": 0.10252837836742401, + "epoch": 4.66977985323549, + "step": 14000 + }, + { + "epoch": 4.66977985323549, + "ref_ce_loss": 0.07332263886928558, + "step": 14000 + }, + { + "epoch": 4.673115410273516, + "loss": 0.4056, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "grad_norm": 2.8276865482330322, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "learning_rate": 0.00011669521485151591, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "loss": 0.6654629707336426, + "step": 14010 + }, + { + "ce_loss": 0.10808245092630386, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "distill_loss": 0.11504686623811722, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "ref_ce_loss": 0.07503039389848709, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "loss": 0.6763304471969604, + "step": 14010 + }, + { + "ce_loss": 0.1163989007472992, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "distill_loss": 0.19686204195022583, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "ref_ce_loss": 0.09545636177062988, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "loss": 0.2588968276977539, + "step": 14010 + }, + { + "ce_loss": 0.04675479605793953, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "distill_loss": 0.13797177374362946, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "ref_ce_loss": 0.0738506093621254, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "loss": 0.3704875707626343, + "step": 14010 + }, + { + "ce_loss": 0.12819942831993103, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "distill_loss": 0.11613549292087555, + "epoch": 4.673115410273516, + "step": 14010 + }, + { + "epoch": 4.673115410273516, + "ref_ce_loss": 0.09399432688951492, + "step": 14010 + }, + { + "epoch": 4.676450967311541, + "loss": 0.4743, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "grad_norm": 4.23807954788208, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "learning_rate": 0.00011649773990874573, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "loss": 0.3631729781627655, + "step": 14020 + }, + { + "ce_loss": 0.09874864667654037, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "distill_loss": 0.1832832247018814, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "ref_ce_loss": 0.08077353239059448, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "loss": 0.3315301835536957, + "step": 14020 + }, + { + "ce_loss": 0.030873116105794907, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "distill_loss": 0.16136936843395233, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "ref_ce_loss": 0.06957585364580154, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "loss": 0.5829925537109375, + "step": 14020 + }, + { + "ce_loss": 0.1614890992641449, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "distill_loss": 0.19390279054641724, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "ref_ce_loss": 0.09260150045156479, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "loss": 0.47367650270462036, + "step": 14020 + }, + { + "ce_loss": 0.06276272982358932, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "distill_loss": 0.13485144078731537, + "epoch": 4.676450967311541, + "step": 14020 + }, + { + "epoch": 4.676450967311541, + "ref_ce_loss": 0.09369198977947235, + "step": 14020 + }, + { + "epoch": 4.679786524349566, + "loss": 0.456, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "grad_norm": 3.837237596511841, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "learning_rate": 0.0001163003260608824, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "loss": 0.31302571296691895, + "step": 14030 + }, + { + "ce_loss": 0.09062153846025467, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "distill_loss": 0.10657159984111786, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "ref_ce_loss": 0.08325086534023285, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "loss": 0.7584108114242554, + "step": 14030 + }, + { + "ce_loss": 0.09911534935235977, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "distill_loss": 0.12623099982738495, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "ref_ce_loss": 0.14725255966186523, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "loss": 0.5008531808853149, + "step": 14030 + }, + { + "ce_loss": 0.10759527236223221, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "distill_loss": 0.14299361407756805, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "ref_ce_loss": 0.11691004782915115, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "loss": 0.2524974048137665, + "step": 14030 + }, + { + "ce_loss": 0.03561868518590927, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "distill_loss": 0.13155224919319153, + "epoch": 4.679786524349566, + "step": 14030 + }, + { + "epoch": 4.679786524349566, + "ref_ce_loss": 0.04909590631723404, + "step": 14030 + }, + { + "epoch": 4.683122081387592, + "loss": 0.4802, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "grad_norm": 2.0721302032470703, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "learning_rate": 0.00011610297366793094, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "loss": 0.22465814650058746, + "step": 14040 + }, + { + "ce_loss": 0.06050679087638855, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "distill_loss": 0.08898370712995529, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "ref_ce_loss": 0.07474206387996674, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "loss": 0.8398646116256714, + "step": 14040 + }, + { + "ce_loss": 0.08319870382547379, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "distill_loss": 0.1809157282114029, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "ref_ce_loss": 0.08680960536003113, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "loss": 0.5027204155921936, + "step": 14040 + }, + { + "ce_loss": 0.1770513951778412, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "distill_loss": 0.20788423717021942, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "ref_ce_loss": 0.09297391772270203, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "loss": 0.30045297741889954, + "step": 14040 + }, + { + "ce_loss": 0.10085663199424744, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "distill_loss": 0.09326339513063431, + "epoch": 4.683122081387592, + "step": 14040 + }, + { + "epoch": 4.683122081387592, + "ref_ce_loss": 0.10611985623836517, + "step": 14040 + }, + { + "epoch": 4.686457638425617, + "loss": 0.4225, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "grad_norm": 2.194854736328125, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "learning_rate": 0.00011590568308978418, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "loss": 0.2557274401187897, + "step": 14050 + }, + { + "ce_loss": 0.0705738216638565, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "distill_loss": 0.09358223527669907, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "ref_ce_loss": 0.046359218657016754, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "loss": 0.3328011631965637, + "step": 14050 + }, + { + "ce_loss": 0.03167080879211426, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "distill_loss": 0.17716750502586365, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "ref_ce_loss": 0.08014225959777832, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "loss": 0.5323995351791382, + "step": 14050 + }, + { + "ce_loss": 0.14414207637310028, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "distill_loss": 0.15471740067005157, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "ref_ce_loss": 0.08761541545391083, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "loss": 0.3618018627166748, + "step": 14050 + }, + { + "ce_loss": 0.06889332085847855, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "distill_loss": 0.14258632063865662, + "epoch": 4.686457638425617, + "step": 14050 + }, + { + "epoch": 4.686457638425617, + "ref_ce_loss": 0.05323160067200661, + "step": 14050 + }, + { + "epoch": 4.6897931954636425, + "loss": 0.4491, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "grad_norm": 2.5217649936676025, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "learning_rate": 0.0001157084546862224, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "loss": 0.4820738434791565, + "step": 14060 + }, + { + "ce_loss": 0.09973058849573135, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "distill_loss": 0.13066692650318146, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "ref_ce_loss": 0.10295473039150238, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "loss": 0.9242709875106812, + "step": 14060 + }, + { + "ce_loss": 0.12287642806768417, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "distill_loss": 0.14985498785972595, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "ref_ce_loss": 0.09447828680276871, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "loss": 0.6546158790588379, + "step": 14060 + }, + { + "ce_loss": 0.15501223504543304, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "distill_loss": 0.12218710035085678, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "ref_ce_loss": 0.10068108886480331, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "loss": 0.3825370669364929, + "step": 14060 + }, + { + "ce_loss": 0.1286890059709549, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "distill_loss": 0.15739960968494415, + "epoch": 4.6897931954636425, + "step": 14060 + }, + { + "epoch": 4.6897931954636425, + "ref_ce_loss": 0.09618832170963287, + "step": 14060 + }, + { + "epoch": 4.693128752501668, + "loss": 0.424, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "grad_norm": 2.1217305660247803, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "learning_rate": 0.00011551128881691231, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "loss": 0.41536468267440796, + "step": 14070 + }, + { + "ce_loss": 0.14819218218326569, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "distill_loss": 0.14639030396938324, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "ref_ce_loss": 0.12024623900651932, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "loss": 0.23171193897724152, + "step": 14070 + }, + { + "ce_loss": 0.05471116676926613, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "distill_loss": 0.11721976846456528, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "ref_ce_loss": 0.0596269890666008, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "loss": 0.8457027673721313, + "step": 14070 + }, + { + "ce_loss": 0.13278742134571075, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "distill_loss": 0.216340571641922, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "ref_ce_loss": 0.11654487252235413, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "loss": 0.5509470701217651, + "step": 14070 + }, + { + "ce_loss": 0.08814974129199982, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "distill_loss": 0.17692141234874725, + "epoch": 4.693128752501668, + "step": 14070 + }, + { + "epoch": 4.693128752501668, + "ref_ce_loss": 0.1137252077460289, + "step": 14070 + }, + { + "epoch": 4.696464309539693, + "loss": 0.4754, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "grad_norm": 3.004528045654297, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "learning_rate": 0.00011531418584140673, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "loss": 0.479631245136261, + "step": 14080 + }, + { + "ce_loss": 0.10905808955430984, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "distill_loss": 0.14824768900871277, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "ref_ce_loss": 0.10647785663604736, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "loss": 0.35922229290008545, + "step": 14080 + }, + { + "ce_loss": 0.05895942449569702, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "distill_loss": 0.14783093333244324, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "ref_ce_loss": 0.07992715388536453, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "loss": 0.33700183033943176, + "step": 14080 + }, + { + "ce_loss": 0.06836721301078796, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "distill_loss": 0.17993780970573425, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "ref_ce_loss": 0.06357478350400925, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "loss": 0.23770642280578613, + "step": 14080 + }, + { + "ce_loss": 0.061131224036216736, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "distill_loss": 0.11508830636739731, + "epoch": 4.696464309539693, + "step": 14080 + }, + { + "epoch": 4.696464309539693, + "ref_ce_loss": 0.051302142441272736, + "step": 14080 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.4443, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "grad_norm": 2.966629981994629, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "learning_rate": 0.00011511714611914378, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.27354520559310913, + "step": 14090 + }, + { + "ce_loss": 0.017260253429412842, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "distill_loss": 0.0955350324511528, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "ref_ce_loss": 0.05125044286251068, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.453208863735199, + "step": 14090 + }, + { + "ce_loss": 0.058098290115594864, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "distill_loss": 0.13044509291648865, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "ref_ce_loss": 0.07986553758382797, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.38690540194511414, + "step": 14090 + }, + { + "ce_loss": 0.13304416835308075, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "distill_loss": 0.1314799189567566, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "ref_ce_loss": 0.08153283596038818, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "loss": 0.35225972533226013, + "step": 14090 + }, + { + "ce_loss": 0.08277490735054016, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "distill_loss": 0.1228342354297638, + "epoch": 4.6997998665777185, + "step": 14090 + }, + { + "epoch": 4.6997998665777185, + "ref_ce_loss": 0.09828595072031021, + "step": 14090 + }, + { + "epoch": 4.703135423615744, + "loss": 0.4201, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "grad_norm": 2.899693250656128, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "learning_rate": 0.00011492017000944613, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "loss": 0.38401275873184204, + "step": 14100 + }, + { + "ce_loss": 0.10745421797037125, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "distill_loss": 0.17753872275352478, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "ref_ce_loss": 0.09878189116716385, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "loss": 0.41860488057136536, + "step": 14100 + }, + { + "ce_loss": 0.1281246542930603, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "distill_loss": 0.10887651890516281, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "ref_ce_loss": 0.07744525372982025, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "loss": 0.5172537565231323, + "step": 14100 + }, + { + "ce_loss": 0.12804587185382843, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "distill_loss": 0.12572410702705383, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "ref_ce_loss": 0.0798870399594307, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "loss": 0.3483532965183258, + "step": 14100 + }, + { + "ce_loss": 0.06657369434833527, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "distill_loss": 0.11907947063446045, + "epoch": 4.703135423615744, + "step": 14100 + }, + { + "epoch": 4.703135423615744, + "ref_ce_loss": 0.064053475856781, + "step": 14100 + }, + { + "epoch": 4.706470980653769, + "loss": 0.4565, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "grad_norm": 2.73425555229187, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "learning_rate": 0.00011472325787152053, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "loss": 0.37849685549736023, + "step": 14110 + }, + { + "ce_loss": 0.09926678240299225, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "distill_loss": 0.1470641791820526, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "ref_ce_loss": 0.05578787997364998, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "loss": 0.33840328454971313, + "step": 14110 + }, + { + "ce_loss": 0.06550583243370056, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "distill_loss": 0.14875485002994537, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "ref_ce_loss": 0.05572007596492767, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "loss": 0.275194376707077, + "step": 14110 + }, + { + "ce_loss": 0.018452441319823265, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "distill_loss": 0.08852989971637726, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "ref_ce_loss": 0.032092366367578506, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "loss": 0.39704009890556335, + "step": 14110 + }, + { + "ce_loss": 0.07161843031644821, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "distill_loss": 0.17728634178638458, + "epoch": 4.706470980653769, + "step": 14110 + }, + { + "epoch": 4.706470980653769, + "ref_ce_loss": 0.10999871045351028, + "step": 14110 + }, + { + "epoch": 4.709806537691795, + "loss": 0.4662, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "grad_norm": 10.664507865905762, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "learning_rate": 0.000114526410064457, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "loss": 0.25680869817733765, + "step": 14120 + }, + { + "ce_loss": 0.07057490199804306, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "distill_loss": 0.12687957286834717, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "ref_ce_loss": 0.05930490419268608, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "loss": 0.3404933512210846, + "step": 14120 + }, + { + "ce_loss": 0.07909941673278809, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "distill_loss": 0.11669367551803589, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "ref_ce_loss": 0.08591524511575699, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "loss": 0.39324355125427246, + "step": 14120 + }, + { + "ce_loss": 0.0727115124464035, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "distill_loss": 0.18177631497383118, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "ref_ce_loss": 0.06662489473819733, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "loss": 0.2780805230140686, + "step": 14120 + }, + { + "ce_loss": 0.07770053297281265, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "distill_loss": 0.11451657861471176, + "epoch": 4.709806537691795, + "step": 14120 + }, + { + "epoch": 4.709806537691795, + "ref_ce_loss": 0.08560647070407867, + "step": 14120 + }, + { + "epoch": 4.71314209472982, + "loss": 1.3647, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "grad_norm": 11.033317565917969, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "learning_rate": 0.00011432962694722833, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "loss": 2.30210542678833, + "step": 14130 + }, + { + "ce_loss": 1.3777077198028564, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "distill_loss": 0.17551563680171967, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "ref_ce_loss": 0.6524390578269958, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "loss": 2.1025567054748535, + "step": 14130 + }, + { + "ce_loss": 1.2161785364151, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "distill_loss": 0.14761461317539215, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "ref_ce_loss": 0.6293615102767944, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "loss": 1.9151501655578613, + "step": 14130 + }, + { + "ce_loss": 1.1945534944534302, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "distill_loss": 0.1268174946308136, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "ref_ce_loss": 0.5792096257209778, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "loss": 2.032562494277954, + "step": 14130 + }, + { + "ce_loss": 1.1563758850097656, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "distill_loss": 0.15413472056388855, + "epoch": 4.71314209472982, + "step": 14130 + }, + { + "epoch": 4.71314209472982, + "ref_ce_loss": 0.6570587158203125, + "step": 14130 + }, + { + "epoch": 4.716477651767845, + "loss": 1.1888, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "grad_norm": 279.4869689941406, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "learning_rate": 0.00011413290887868933, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "loss": 0.9591177105903625, + "step": 14140 + }, + { + "ce_loss": 0.3586890697479248, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "distill_loss": 0.13500162959098816, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "ref_ce_loss": 0.27306056022644043, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "loss": 0.5405594110488892, + "step": 14140 + }, + { + "ce_loss": 0.20202241837978363, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "distill_loss": 0.141360804438591, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "ref_ce_loss": 0.1429208666086197, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "loss": 0.5801871418952942, + "step": 14140 + }, + { + "ce_loss": 0.21308133006095886, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "distill_loss": 0.15593229234218597, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "ref_ce_loss": 0.16659733653068542, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "loss": 0.8410604000091553, + "step": 14140 + }, + { + "ce_loss": 0.41682738065719604, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "distill_loss": 0.14686113595962524, + "epoch": 4.716477651767845, + "step": 14140 + }, + { + "epoch": 4.716477651767845, + "ref_ce_loss": 0.2372933030128479, + "step": 14140 + }, + { + "epoch": 4.719813208805871, + "loss": 0.6165, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "grad_norm": 2.762190818786621, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "learning_rate": 0.00011393625621757609, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "loss": 0.39677220582962036, + "step": 14150 + }, + { + "ce_loss": 0.14377433061599731, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "distill_loss": 0.12296594679355621, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "ref_ce_loss": 0.09659332036972046, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "loss": 0.42406103014945984, + "step": 14150 + }, + { + "ce_loss": 0.1596388965845108, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "distill_loss": 0.12342248857021332, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "ref_ce_loss": 0.11014783382415771, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "loss": 0.5634651184082031, + "step": 14150 + }, + { + "ce_loss": 0.22476035356521606, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "distill_loss": 0.15548166632652283, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "ref_ce_loss": 0.15671610832214355, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "loss": 0.4034387767314911, + "step": 14150 + }, + { + "ce_loss": 0.19524210691452026, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "distill_loss": 0.09127781540155411, + "epoch": 4.719813208805871, + "step": 14150 + }, + { + "epoch": 4.719813208805871, + "ref_ce_loss": 0.1166234090924263, + "step": 14150 + }, + { + "epoch": 4.723148765843896, + "loss": 0.5712, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "grad_norm": 2.8528082370758057, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "learning_rate": 0.00011373966932250552, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "loss": 0.9441983699798584, + "step": 14160 + }, + { + "ce_loss": 0.21303069591522217, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "distill_loss": 0.19046366214752197, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "ref_ce_loss": 0.15359333157539368, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "loss": 0.5612851977348328, + "step": 14160 + }, + { + "ce_loss": 0.29807958006858826, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "distill_loss": 0.14131425321102142, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "ref_ce_loss": 0.1210092157125473, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "loss": 0.5455232262611389, + "step": 14160 + }, + { + "ce_loss": 0.19224156439304352, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "distill_loss": 0.11192300915718079, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "ref_ce_loss": 0.1534367799758911, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "loss": 0.46232539415359497, + "step": 14160 + }, + { + "ce_loss": 0.13547080755233765, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "distill_loss": 0.18183527886867523, + "epoch": 4.723148765843896, + "step": 14160 + }, + { + "epoch": 4.723148765843896, + "ref_ce_loss": 0.14482565224170685, + "step": 14160 + }, + { + "epoch": 4.726484322881921, + "loss": 0.5941, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "grad_norm": 2.678342580795288, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "learning_rate": 0.0001135431485519746, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "loss": 0.44022274017333984, + "step": 14170 + }, + { + "ce_loss": 0.10185471922159195, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "distill_loss": 0.10535676032304764, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "ref_ce_loss": 0.10007350146770477, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "loss": 0.6223923563957214, + "step": 14170 + }, + { + "ce_loss": 0.2227141410112381, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "distill_loss": 0.18482142686843872, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "ref_ce_loss": 0.18490292131900787, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "loss": 0.6841763854026794, + "step": 14170 + }, + { + "ce_loss": 0.2221713364124298, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "distill_loss": 0.19176220893859863, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "ref_ce_loss": 0.14797654747962952, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "loss": 0.5130280256271362, + "step": 14170 + }, + { + "ce_loss": 0.18819832801818848, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "distill_loss": 0.16560444235801697, + "epoch": 4.726484322881921, + "step": 14170 + }, + { + "epoch": 4.726484322881921, + "ref_ce_loss": 0.1364285945892334, + "step": 14170 + }, + { + "epoch": 4.729819879919947, + "loss": 0.4885, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "grad_norm": 2.7698051929473877, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "learning_rate": 0.00011334669426435963, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "loss": 0.8724526166915894, + "step": 14180 + }, + { + "ce_loss": 0.2848694622516632, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "distill_loss": 0.13748759031295776, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "ref_ce_loss": 0.16300438344478607, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "loss": 0.40946438908576965, + "step": 14180 + }, + { + "ce_loss": 0.1276213824748993, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "distill_loss": 0.15523076057434082, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "ref_ce_loss": 0.082614965736866, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "loss": 0.46860525012016296, + "step": 14180 + }, + { + "ce_loss": 0.2063319981098175, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "distill_loss": 0.12221656739711761, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "ref_ce_loss": 0.1397939771413803, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "loss": 0.3337191641330719, + "step": 14180 + }, + { + "ce_loss": 0.0916631668806076, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "distill_loss": 0.122122623026371, + "epoch": 4.729819879919947, + "step": 14180 + }, + { + "epoch": 4.729819879919947, + "ref_ce_loss": 0.11975608766078949, + "step": 14180 + }, + { + "epoch": 4.733155436957972, + "loss": 0.5128, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "grad_norm": 2.4588358402252197, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "learning_rate": 0.00011315030681791585, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "loss": 0.5669198036193848, + "step": 14190 + }, + { + "ce_loss": 0.2199530005455017, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "distill_loss": 0.13584327697753906, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "ref_ce_loss": 0.13840018212795258, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "loss": 0.5684414505958557, + "step": 14190 + }, + { + "ce_loss": 0.19079327583312988, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "distill_loss": 0.1567957103252411, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "ref_ce_loss": 0.18704859912395477, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "loss": 0.36393260955810547, + "step": 14190 + }, + { + "ce_loss": 0.12109008431434631, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "distill_loss": 0.15616634488105774, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "ref_ce_loss": 0.061299532651901245, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "loss": 0.4499196410179138, + "step": 14190 + }, + { + "ce_loss": 0.11336159706115723, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "distill_loss": 0.11849113553762436, + "epoch": 4.733155436957972, + "step": 14190 + }, + { + "epoch": 4.733155436957972, + "ref_ce_loss": 0.08597545325756073, + "step": 14190 + }, + { + "epoch": 4.736490993995997, + "loss": 0.4978, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "grad_norm": 3.0749502182006836, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "learning_rate": 0.00011295398657077633, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "loss": 0.383294939994812, + "step": 14200 + }, + { + "ce_loss": 0.14222989976406097, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "distill_loss": 0.10870449244976044, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "ref_ce_loss": 0.09834384173154831, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "loss": 0.228814497590065, + "step": 14200 + }, + { + "ce_loss": 0.05184551328420639, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "distill_loss": 0.10386732220649719, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "ref_ce_loss": 0.0729263499379158, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "loss": 0.37187135219573975, + "step": 14200 + }, + { + "ce_loss": 0.09685519337654114, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "distill_loss": 0.09510515630245209, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "ref_ce_loss": 0.09703058749437332, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "loss": 0.27483007311820984, + "step": 14200 + }, + { + "ce_loss": 0.08314543217420578, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "distill_loss": 0.09368523210287094, + "epoch": 4.736490993995997, + "step": 14200 + }, + { + "epoch": 4.736490993995997, + "ref_ce_loss": 0.07162053138017654, + "step": 14200 + }, + { + "epoch": 4.739826551034023, + "loss": 0.4949, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "grad_norm": 1.9191662073135376, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "learning_rate": 0.00011275773388095185, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "loss": 0.43043410778045654, + "step": 14210 + }, + { + "ce_loss": 0.10714246332645416, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "distill_loss": 0.10732196271419525, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "ref_ce_loss": 0.06809937208890915, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "loss": 0.3872106671333313, + "step": 14210 + }, + { + "ce_loss": 0.09992988407611847, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "distill_loss": 0.10379815101623535, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "ref_ce_loss": 0.10170990973711014, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "loss": 0.40114355087280273, + "step": 14210 + }, + { + "ce_loss": 0.10342927277088165, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "distill_loss": 0.12139730900526047, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "ref_ce_loss": 0.08242413401603699, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "loss": 0.5521736741065979, + "step": 14210 + }, + { + "ce_loss": 0.11475184559822083, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "distill_loss": 0.14817777276039124, + "epoch": 4.739826551034023, + "step": 14210 + }, + { + "epoch": 4.739826551034023, + "ref_ce_loss": 0.1247396469116211, + "step": 14210 + }, + { + "epoch": 4.743162108072048, + "loss": 0.5387, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "grad_norm": 2.4396839141845703, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "learning_rate": 0.00011256154910632998, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "loss": 0.9258735179901123, + "step": 14220 + }, + { + "ce_loss": 0.23057223856449127, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "distill_loss": 0.162357360124588, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "ref_ce_loss": 0.1291550099849701, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "loss": 0.5556474924087524, + "step": 14220 + }, + { + "ce_loss": 0.1705268770456314, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "distill_loss": 0.09931408613920212, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "ref_ce_loss": 0.10927734524011612, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "loss": 0.6825743913650513, + "step": 14220 + }, + { + "ce_loss": 0.09274369478225708, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "distill_loss": 0.09510725736618042, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "ref_ce_loss": 0.07657185196876526, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "loss": 0.45636534690856934, + "step": 14220 + }, + { + "ce_loss": 0.1472557932138443, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "distill_loss": 0.13751322031021118, + "epoch": 4.743162108072048, + "step": 14220 + }, + { + "epoch": 4.743162108072048, + "ref_ce_loss": 0.08142144232988358, + "step": 14220 + }, + { + "epoch": 4.746497665110073, + "loss": 0.4579, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "grad_norm": 2.141514539718628, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "learning_rate": 0.00011236543260467418, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "loss": 0.521102249622345, + "step": 14230 + }, + { + "ce_loss": 0.203525111079216, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "distill_loss": 0.11754366755485535, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "ref_ce_loss": 0.19978143274784088, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "loss": 0.4507944583892822, + "step": 14230 + }, + { + "ce_loss": 0.0714416429400444, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "distill_loss": 0.11438420414924622, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "ref_ce_loss": 0.08590278029441833, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "loss": 0.3971126973628998, + "step": 14230 + }, + { + "ce_loss": 0.1454489678144455, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "distill_loss": 0.11768228560686111, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "ref_ce_loss": 0.13380113244056702, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "loss": 0.43891745805740356, + "step": 14230 + }, + { + "ce_loss": 0.1446937918663025, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "distill_loss": 0.12114118039608002, + "epoch": 4.746497665110073, + "step": 14230 + }, + { + "epoch": 4.746497665110073, + "ref_ce_loss": 0.07623746246099472, + "step": 14230 + }, + { + "epoch": 4.749833222148099, + "loss": 0.4922, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "grad_norm": 2.9477293491363525, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "learning_rate": 0.00011216938473362377, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "loss": 0.4097670912742615, + "step": 14240 + }, + { + "ce_loss": 0.11067419499158859, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "distill_loss": 0.15952913463115692, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "ref_ce_loss": 0.09814775735139847, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "loss": 0.5153831839561462, + "step": 14240 + }, + { + "ce_loss": 0.22324255108833313, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "distill_loss": 0.13816522061824799, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "ref_ce_loss": 0.1536806970834732, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "loss": 0.5292138457298279, + "step": 14240 + }, + { + "ce_loss": 0.12397529184818268, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "distill_loss": 0.1275257170200348, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "ref_ce_loss": 0.0806649923324585, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "loss": 0.43219509720802307, + "step": 14240 + }, + { + "ce_loss": 0.13407444953918457, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "distill_loss": 0.14914165437221527, + "epoch": 4.749833222148099, + "step": 14240 + }, + { + "epoch": 4.749833222148099, + "ref_ce_loss": 0.10060304403305054, + "step": 14240 + }, + { + "epoch": 4.753168779186124, + "loss": 0.4562, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "grad_norm": 2.0971434116363525, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "learning_rate": 0.00011197340585069259, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "loss": 0.4454609751701355, + "step": 14250 + }, + { + "ce_loss": 0.0977015420794487, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "distill_loss": 0.10412100702524185, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "ref_ce_loss": 0.10450634360313416, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "loss": 0.4546106457710266, + "step": 14250 + }, + { + "ce_loss": 0.18327759206295013, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "distill_loss": 0.11298847198486328, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "ref_ce_loss": 0.11655457317829132, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "loss": 0.6993808746337891, + "step": 14250 + }, + { + "ce_loss": 0.132548525929451, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "distill_loss": 0.18319010734558105, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "ref_ce_loss": 0.08631079643964767, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "loss": 0.4419463276863098, + "step": 14250 + }, + { + "ce_loss": 0.13755536079406738, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "distill_loss": 0.1329721212387085, + "epoch": 4.753168779186124, + "step": 14250 + }, + { + "epoch": 4.753168779186124, + "ref_ce_loss": 0.0995076596736908, + "step": 14250 + }, + { + "epoch": 4.7565043362241495, + "loss": 0.4859, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "grad_norm": 2.6352319717407227, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "learning_rate": 0.00011177749631326887, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "loss": 0.49517810344696045, + "step": 14260 + }, + { + "ce_loss": 0.1591547429561615, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "distill_loss": 0.15403160452842712, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "ref_ce_loss": 0.1262081265449524, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "loss": 0.6272794604301453, + "step": 14260 + }, + { + "ce_loss": 0.31204718351364136, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "distill_loss": 0.16110165417194366, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "ref_ce_loss": 0.13668222725391388, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "loss": 0.3994016945362091, + "step": 14260 + }, + { + "ce_loss": 0.09157441556453705, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "distill_loss": 0.13846012949943542, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "ref_ce_loss": 0.10564356297254562, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "loss": 1.257520318031311, + "step": 14260 + }, + { + "ce_loss": 0.2869233787059784, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "distill_loss": 0.22305631637573242, + "epoch": 4.7565043362241495, + "step": 14260 + }, + { + "epoch": 4.7565043362241495, + "ref_ce_loss": 0.13798722624778748, + "step": 14260 + }, + { + "epoch": 4.759839893262175, + "loss": 0.5491, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "grad_norm": 2.1373462677001953, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "learning_rate": 0.00011158165647861435, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "loss": 0.45616188645362854, + "step": 14270 + }, + { + "ce_loss": 0.20117731392383575, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "distill_loss": 0.11334202438592911, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "ref_ce_loss": 0.11822082102298737, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "loss": 0.5591301321983337, + "step": 14270 + }, + { + "ce_loss": 0.15160302817821503, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "distill_loss": 0.12180090695619583, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "ref_ce_loss": 0.13151346147060394, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "loss": 0.41168203949928284, + "step": 14270 + }, + { + "ce_loss": 0.12057756632566452, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "distill_loss": 0.12694385647773743, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "ref_ce_loss": 0.11435339599847794, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "loss": 0.39429712295532227, + "step": 14270 + }, + { + "ce_loss": 0.024822566658258438, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "distill_loss": 0.08758328855037689, + "epoch": 4.759839893262175, + "step": 14270 + }, + { + "epoch": 4.759839893262175, + "ref_ce_loss": 0.05988505855202675, + "step": 14270 + }, + { + "epoch": 4.7631754503002, + "loss": 0.502, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "grad_norm": 2.955759286880493, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "learning_rate": 0.00011138588670386358, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "loss": 0.3777836263179779, + "step": 14280 + }, + { + "ce_loss": 0.15252237021923065, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "distill_loss": 0.12933968007564545, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "ref_ce_loss": 0.09582728147506714, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "loss": 0.26405563950538635, + "step": 14280 + }, + { + "ce_loss": 0.09645365178585052, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "distill_loss": 0.09345416724681854, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "ref_ce_loss": 0.07391425222158432, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "loss": 0.20412151515483856, + "step": 14280 + }, + { + "ce_loss": 0.025568762794137, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "distill_loss": 0.07479839771986008, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "ref_ce_loss": 0.06605122238397598, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "loss": 0.3875410258769989, + "step": 14280 + }, + { + "ce_loss": 0.15600574016571045, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "distill_loss": 0.1275922656059265, + "epoch": 4.7631754503002, + "step": 14280 + }, + { + "epoch": 4.7631754503002, + "ref_ce_loss": 0.07461174577474594, + "step": 14280 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.444, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "grad_norm": 2.552724838256836, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "learning_rate": 0.0001111901873460235, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.43322882056236267, + "step": 14290 + }, + { + "ce_loss": 0.17510177195072174, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "distill_loss": 0.13811074197292328, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "ref_ce_loss": 0.11976328492164612, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.5688863396644592, + "step": 14290 + }, + { + "ce_loss": 0.2690722942352295, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "distill_loss": 0.11816976964473724, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "ref_ce_loss": 0.14800193905830383, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.3511410355567932, + "step": 14290 + }, + { + "ce_loss": 0.13480216264724731, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "distill_loss": 0.10867208987474442, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "ref_ce_loss": 0.056814275681972504, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "loss": 0.7902196645736694, + "step": 14290 + }, + { + "ce_loss": 0.23585091531276703, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "distill_loss": 0.13959118723869324, + "epoch": 4.7665110073382255, + "step": 14290 + }, + { + "epoch": 4.7665110073382255, + "ref_ce_loss": 0.10354238748550415, + "step": 14290 + }, + { + "epoch": 4.769846564376251, + "loss": 0.4744, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "grad_norm": 3.3194260597229004, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "learning_rate": 0.0001109945587619724, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "loss": 0.504343569278717, + "step": 14300 + }, + { + "ce_loss": 0.16864703595638275, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "distill_loss": 0.14713454246520996, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "ref_ce_loss": 0.13997380435466766, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "loss": 0.3732161223888397, + "step": 14300 + }, + { + "ce_loss": 0.11053475737571716, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "distill_loss": 0.10585125535726547, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "ref_ce_loss": 0.06673439592123032, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "loss": 0.5125171542167664, + "step": 14300 + }, + { + "ce_loss": 0.2005772441625595, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "distill_loss": 0.1538941115140915, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "ref_ce_loss": 0.12865741550922394, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "loss": 0.5521990060806274, + "step": 14300 + }, + { + "ce_loss": 0.16355322301387787, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "distill_loss": 0.12216568738222122, + "epoch": 4.769846564376251, + "step": 14300 + }, + { + "epoch": 4.769846564376251, + "ref_ce_loss": 0.10937811434268951, + "step": 14300 + }, + { + "epoch": 4.773182121414276, + "loss": 0.4654, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "grad_norm": 1.955685019493103, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "learning_rate": 0.00011079900130845976, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "loss": 0.45663851499557495, + "step": 14310 + }, + { + "ce_loss": 0.13172826170921326, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "distill_loss": 0.10975062847137451, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "ref_ce_loss": 0.08388905972242355, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "loss": 0.642504870891571, + "step": 14310 + }, + { + "ce_loss": 0.125015527009964, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "distill_loss": 0.1834339052438736, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "ref_ce_loss": 0.1528361737728119, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "loss": 0.479583740234375, + "step": 14310 + }, + { + "ce_loss": 0.18668422102928162, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "distill_loss": 0.10831034928560257, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "ref_ce_loss": 0.13767296075820923, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "loss": 0.45591068267822266, + "step": 14310 + }, + { + "ce_loss": 0.1843109130859375, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "distill_loss": 0.16249501705169678, + "epoch": 4.773182121414276, + "step": 14310 + }, + { + "epoch": 4.773182121414276, + "ref_ce_loss": 0.10894269496202469, + "step": 14310 + }, + { + "epoch": 4.776517678452302, + "loss": 0.4768, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "grad_norm": 2.224634885787964, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "learning_rate": 0.00011060351534210522, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "loss": 0.36553362011909485, + "step": 14320 + }, + { + "ce_loss": 0.16818471252918243, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "distill_loss": 0.13930566608905792, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "ref_ce_loss": 0.05784922093153, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "loss": 0.361936092376709, + "step": 14320 + }, + { + "ce_loss": 0.09541445970535278, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "distill_loss": 0.11125258356332779, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "ref_ce_loss": 0.09528297185897827, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "loss": 0.40957027673721313, + "step": 14320 + }, + { + "ce_loss": 0.07518623024225235, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "distill_loss": 0.12784458696842194, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "ref_ce_loss": 0.07247889041900635, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "loss": 0.46657589077949524, + "step": 14320 + }, + { + "ce_loss": 0.11209922283887863, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "distill_loss": 0.13903628289699554, + "epoch": 4.776517678452302, + "step": 14320 + }, + { + "epoch": 4.776517678452302, + "ref_ce_loss": 0.1548924446105957, + "step": 14320 + }, + { + "epoch": 4.779853235490327, + "loss": 0.4814, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "grad_norm": 6.027482509613037, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "learning_rate": 0.00011040810121939803, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "loss": 0.42402875423431396, + "step": 14330 + }, + { + "ce_loss": 0.16902658343315125, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "distill_loss": 0.11825080960988998, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "ref_ce_loss": 0.08660327643156052, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "loss": 0.36984482407569885, + "step": 14330 + }, + { + "ce_loss": 0.1259082704782486, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "distill_loss": 0.13614816963672638, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "ref_ce_loss": 0.09089109301567078, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "loss": 0.39057061076164246, + "step": 14330 + }, + { + "ce_loss": 0.1435929536819458, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "distill_loss": 0.13623438775539398, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "ref_ce_loss": 0.08988281339406967, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "loss": 0.541616678237915, + "step": 14330 + }, + { + "ce_loss": 0.17811079323291779, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "distill_loss": 0.12328089773654938, + "epoch": 4.779853235490327, + "step": 14330 + }, + { + "epoch": 4.779853235490327, + "ref_ce_loss": 0.11967521905899048, + "step": 14330 + }, + { + "epoch": 4.783188792528352, + "loss": 0.43, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "grad_norm": 3.0198564529418945, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "learning_rate": 0.00011021275929669648, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "loss": 0.37297478318214417, + "step": 14340 + }, + { + "ce_loss": 0.10748539865016937, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "distill_loss": 0.141023188829422, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "ref_ce_loss": 0.12405706942081451, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "loss": 0.34351977705955505, + "step": 14340 + }, + { + "ce_loss": 0.09799874573945999, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "distill_loss": 0.1253485083580017, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "ref_ce_loss": 0.11973781138658524, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "loss": 0.991863489151001, + "step": 14340 + }, + { + "ce_loss": 0.24459530413150787, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "distill_loss": 0.13070832192897797, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "ref_ce_loss": 0.09614545851945877, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "loss": 0.5935628414154053, + "step": 14340 + }, + { + "ce_loss": 0.12343177199363708, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "distill_loss": 0.14855144917964935, + "epoch": 4.783188792528352, + "step": 14340 + }, + { + "epoch": 4.783188792528352, + "ref_ce_loss": 0.09137774258852005, + "step": 14340 + }, + { + "epoch": 4.786524349566378, + "loss": 0.5001, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "grad_norm": 2.282027006149292, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "learning_rate": 0.00011001748993022722, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "loss": 0.3946148157119751, + "step": 14350 + }, + { + "ce_loss": 0.07618418335914612, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "distill_loss": 0.15871164202690125, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "ref_ce_loss": 0.05771951004862785, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "loss": 0.450163871049881, + "step": 14350 + }, + { + "ce_loss": 0.07075124979019165, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "distill_loss": 0.17303964495658875, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "ref_ce_loss": 0.0855056643486023, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "loss": 0.49539071321487427, + "step": 14350 + }, + { + "ce_loss": 0.13586698472499847, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "distill_loss": 0.18673719465732574, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "ref_ce_loss": 0.14078141748905182, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "loss": 0.41358691453933716, + "step": 14350 + }, + { + "ce_loss": 0.12783963978290558, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "distill_loss": 0.1703040450811386, + "epoch": 4.786524349566378, + "step": 14350 + }, + { + "epoch": 4.786524349566378, + "ref_ce_loss": 0.0900261327624321, + "step": 14350 + }, + { + "epoch": 4.789859906604403, + "loss": 0.4901, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "grad_norm": 3.0800037384033203, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "learning_rate": 0.00010982229347608446, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "loss": 0.412725031375885, + "step": 14360 + }, + { + "ce_loss": 0.1197974905371666, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "distill_loss": 0.18120068311691284, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "ref_ce_loss": 0.08914639800786972, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "loss": 0.4203656017780304, + "step": 14360 + }, + { + "ce_loss": 0.08861410617828369, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "distill_loss": 0.1394980102777481, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "ref_ce_loss": 0.08281680941581726, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "loss": 0.4946492612361908, + "step": 14360 + }, + { + "ce_loss": 0.11306752264499664, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "distill_loss": 0.17083683609962463, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "ref_ce_loss": 0.1248176321387291, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "loss": 0.5337336659431458, + "step": 14360 + }, + { + "ce_loss": 0.166512593626976, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "distill_loss": 0.18739216029644012, + "epoch": 4.789859906604403, + "step": 14360 + }, + { + "epoch": 4.789859906604403, + "ref_ce_loss": 0.121116504073143, + "step": 14360 + }, + { + "epoch": 4.793195463642428, + "loss": 0.4662, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "grad_norm": 4.17993688583374, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "learning_rate": 0.00010962717029022967, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "loss": 0.46002838015556335, + "step": 14370 + }, + { + "ce_loss": 0.13797122240066528, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "distill_loss": 0.17406927049160004, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "ref_ce_loss": 0.12049631774425507, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "loss": 0.2839353084564209, + "step": 14370 + }, + { + "ce_loss": 0.06530182808637619, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "distill_loss": 0.14406593143939972, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "ref_ce_loss": 0.0744338408112526, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "loss": 0.5167728662490845, + "step": 14370 + }, + { + "ce_loss": 0.05427071079611778, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "distill_loss": 0.19449418783187866, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "ref_ce_loss": 0.10985428839921951, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "loss": 0.5486382246017456, + "step": 14370 + }, + { + "ce_loss": 0.2093065083026886, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "distill_loss": 0.19122518599033356, + "epoch": 4.793195463642428, + "step": 14370 + }, + { + "epoch": 4.793195463642428, + "ref_ce_loss": 0.08407463133335114, + "step": 14370 + }, + { + "epoch": 4.796531020680454, + "loss": 0.4607, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "grad_norm": 2.9548277854919434, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "learning_rate": 0.00010943212072849042, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "loss": 0.3544643819332123, + "step": 14380 + }, + { + "ce_loss": 0.07386091351509094, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "distill_loss": 0.10469326376914978, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "ref_ce_loss": 0.12491313368082047, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "loss": 0.4699355959892273, + "step": 14380 + }, + { + "ce_loss": 0.11637619882822037, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "distill_loss": 0.17287704348564148, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "ref_ce_loss": 0.07765893638134003, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "loss": 0.47565996646881104, + "step": 14380 + }, + { + "ce_loss": 0.1342860609292984, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "distill_loss": 0.1429038941860199, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "ref_ce_loss": 0.06292593479156494, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "loss": 0.35165899991989136, + "step": 14380 + }, + { + "ce_loss": 0.05011136829853058, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "distill_loss": 0.10166673362255096, + "epoch": 4.796531020680454, + "step": 14380 + }, + { + "epoch": 4.796531020680454, + "ref_ce_loss": 0.05985415354371071, + "step": 14380 + }, + { + "epoch": 4.799866577718479, + "loss": 0.4348, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "grad_norm": 1.8150907754898071, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "learning_rate": 0.00010923714514656023, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "loss": 0.4065036475658417, + "step": 14390 + }, + { + "ce_loss": 0.162733793258667, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "distill_loss": 0.15392005443572998, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "ref_ce_loss": 0.0887485072016716, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "loss": 0.3206755220890045, + "step": 14390 + }, + { + "ce_loss": 0.1351880133152008, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "distill_loss": 0.10881204158067703, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "ref_ce_loss": 0.07634250819683075, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "loss": 0.30655360221862793, + "step": 14390 + }, + { + "ce_loss": 0.07555373758077621, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "distill_loss": 0.1615779846906662, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "ref_ce_loss": 0.0693129375576973, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "loss": 0.3501516580581665, + "step": 14390 + }, + { + "ce_loss": 0.08676886558532715, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "distill_loss": 0.10866406559944153, + "epoch": 4.799866577718479, + "step": 14390 + }, + { + "epoch": 4.799866577718479, + "ref_ce_loss": 0.0950222760438919, + "step": 14390 + }, + { + "epoch": 4.803202134756504, + "loss": 0.4181, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "grad_norm": 2.890615463256836, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "learning_rate": 0.00010904224389999772, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "loss": 0.5171407461166382, + "step": 14400 + }, + { + "ce_loss": 0.07824192941188812, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "distill_loss": 0.1526021808385849, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "ref_ce_loss": 0.09420009702444077, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "loss": 0.36038991808891296, + "step": 14400 + }, + { + "ce_loss": 0.12154200673103333, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "distill_loss": 0.14514140784740448, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "ref_ce_loss": 0.07570644468069077, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "loss": 0.44446617364883423, + "step": 14400 + }, + { + "ce_loss": 0.155522882938385, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "distill_loss": 0.16018736362457275, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "ref_ce_loss": 0.09933032840490341, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "loss": 0.43344542384147644, + "step": 14400 + }, + { + "ce_loss": 0.11573299020528793, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "distill_loss": 0.1858031451702118, + "epoch": 4.803202134756504, + "step": 14400 + }, + { + "epoch": 4.803202134756504, + "ref_ce_loss": 0.13141514360904694, + "step": 14400 + }, + { + "epoch": 4.80653769179453, + "loss": 0.3817, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "grad_norm": 2.4496381282806396, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "learning_rate": 0.00010884741734422578, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "loss": 0.7833337783813477, + "step": 14410 + }, + { + "ce_loss": 0.11659690737724304, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "distill_loss": 0.12998512387275696, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "ref_ce_loss": 0.09738679975271225, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "loss": 0.38516169786453247, + "step": 14410 + }, + { + "ce_loss": 0.13952520489692688, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "distill_loss": 0.15259574353694916, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "ref_ce_loss": 0.09262451529502869, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "loss": 0.46893495321273804, + "step": 14410 + }, + { + "ce_loss": 0.10217462480068207, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "distill_loss": 0.18115559220314026, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "ref_ce_loss": 0.07953232526779175, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "loss": 0.3491207957267761, + "step": 14410 + }, + { + "ce_loss": 0.11116782575845718, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "distill_loss": 0.11189278960227966, + "epoch": 4.80653769179453, + "step": 14410 + }, + { + "epoch": 4.80653769179453, + "ref_ce_loss": 0.0821090117096901, + "step": 14410 + }, + { + "epoch": 4.809873248832555, + "loss": 0.4266, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "grad_norm": 3.1390187740325928, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "learning_rate": 0.00010865266583453127, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "loss": 0.22609755396842957, + "step": 14420 + }, + { + "ce_loss": 0.03655946999788284, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "distill_loss": 0.09298989176750183, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "ref_ce_loss": 0.0591839924454689, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "loss": 0.3537759780883789, + "step": 14420 + }, + { + "ce_loss": 0.06846825778484344, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "distill_loss": 0.10115021467208862, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "ref_ce_loss": 0.0874619260430336, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "loss": 0.3921999931335449, + "step": 14420 + }, + { + "ce_loss": 0.09556600451469421, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "distill_loss": 0.13231655955314636, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "ref_ce_loss": 0.05044102296233177, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "loss": 0.6851783990859985, + "step": 14420 + }, + { + "ce_loss": 0.20700956881046295, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "distill_loss": 0.2140367180109024, + "epoch": 4.809873248832555, + "step": 14420 + }, + { + "epoch": 4.809873248832555, + "ref_ce_loss": 0.15442690253257751, + "step": 14420 + }, + { + "epoch": 4.81320880587058, + "loss": 0.3931, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "grad_norm": 2.6131036281585693, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "learning_rate": 0.00010845798972606404, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "loss": 0.21408505737781525, + "step": 14430 + }, + { + "ce_loss": 0.028935782611370087, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "distill_loss": 0.11932484805583954, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "ref_ce_loss": 0.06558303534984589, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "loss": 1.2338591814041138, + "step": 14430 + }, + { + "ce_loss": 0.20345933735370636, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "distill_loss": 0.17151473462581635, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "ref_ce_loss": 0.11353405565023422, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "loss": 0.23042449355125427, + "step": 14430 + }, + { + "ce_loss": 0.08617976307868958, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "distill_loss": 0.09825091063976288, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "ref_ce_loss": 0.04588671773672104, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "loss": 0.2805485725402832, + "step": 14430 + }, + { + "ce_loss": 0.04742460697889328, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "distill_loss": 0.1001681461930275, + "epoch": 4.81320880587058, + "step": 14430 + }, + { + "epoch": 4.81320880587058, + "ref_ce_loss": 0.0729212611913681, + "step": 14430 + }, + { + "epoch": 4.816544362908606, + "loss": 0.4052, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "grad_norm": 6.898608684539795, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "learning_rate": 0.00010826338937383656, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "loss": 0.2627457082271576, + "step": 14440 + }, + { + "ce_loss": 0.05536928027868271, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "distill_loss": 0.11648040264844894, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "ref_ce_loss": 0.09046781063079834, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "loss": 0.5691077709197998, + "step": 14440 + }, + { + "ce_loss": 0.17502161860466003, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "distill_loss": 0.20319420099258423, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "ref_ce_loss": 0.09941479563713074, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "loss": 0.2933313846588135, + "step": 14440 + }, + { + "ce_loss": 0.04782741516828537, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "distill_loss": 0.11192759871482849, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "ref_ce_loss": 0.07869847863912582, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "loss": 0.32633116841316223, + "step": 14440 + }, + { + "ce_loss": 0.13065771758556366, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "distill_loss": 0.12477478384971619, + "epoch": 4.816544362908606, + "step": 14440 + }, + { + "epoch": 4.816544362908606, + "ref_ce_loss": 0.07074069231748581, + "step": 14440 + }, + { + "epoch": 4.819879919946631, + "loss": 0.4009, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "grad_norm": 1.7006696462631226, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "learning_rate": 0.00010806886513272319, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "loss": 0.5183548331260681, + "step": 14450 + }, + { + "ce_loss": 0.14206622540950775, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "distill_loss": 0.10981175303459167, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "ref_ce_loss": 0.10474380850791931, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "loss": 0.37155765295028687, + "step": 14450 + }, + { + "ce_loss": 0.10685473680496216, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "distill_loss": 0.12410110980272293, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "ref_ce_loss": 0.09817085415124893, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "loss": 0.5181396007537842, + "step": 14450 + }, + { + "ce_loss": 0.13136331737041473, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "distill_loss": 0.16278839111328125, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "ref_ce_loss": 0.11400038003921509, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "loss": 0.31124332547187805, + "step": 14450 + }, + { + "ce_loss": 0.08205088973045349, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "distill_loss": 0.17146164178848267, + "epoch": 4.819879919946631, + "step": 14450 + }, + { + "epoch": 4.819879919946631, + "ref_ce_loss": 0.05737442150712013, + "step": 14450 + }, + { + "epoch": 4.8232154769846565, + "loss": 0.386, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "grad_norm": 3.951909065246582, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "learning_rate": 0.00010787441735745924, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "loss": 0.47462722659111023, + "step": 14460 + }, + { + "ce_loss": 0.1653720587491989, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "distill_loss": 0.19095370173454285, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "ref_ce_loss": 0.07604599744081497, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "loss": 0.43112313747406006, + "step": 14460 + }, + { + "ce_loss": 0.08467692136764526, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "distill_loss": 0.12960241734981537, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "ref_ce_loss": 0.07011184096336365, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "loss": 0.42976704239845276, + "step": 14460 + }, + { + "ce_loss": 0.11868524551391602, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "distill_loss": 0.1772325485944748, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "ref_ce_loss": 0.09218654036521912, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "loss": 0.3322980999946594, + "step": 14460 + }, + { + "ce_loss": 0.09327730536460876, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "distill_loss": 0.11820797622203827, + "epoch": 4.8232154769846565, + "step": 14460 + }, + { + "epoch": 4.8232154769846565, + "ref_ce_loss": 0.12056463956832886, + "step": 14460 + }, + { + "epoch": 4.826551034022682, + "loss": 0.4343, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "grad_norm": 3.124202013015747, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "learning_rate": 0.00010768004640264087, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "loss": 0.5490837097167969, + "step": 14470 + }, + { + "ce_loss": 0.13129980862140656, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "distill_loss": 0.15426191687583923, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "ref_ce_loss": 0.09926041960716248, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "loss": 0.4295900762081146, + "step": 14470 + }, + { + "ce_loss": 0.07018239051103592, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "distill_loss": 0.19665493071079254, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "ref_ce_loss": 0.12419581413269043, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "loss": 0.445414662361145, + "step": 14470 + }, + { + "ce_loss": 0.15395063161849976, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "distill_loss": 0.16152891516685486, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "ref_ce_loss": 0.08925561606884003, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "loss": 0.37601250410079956, + "step": 14470 + }, + { + "ce_loss": 0.08798770606517792, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "distill_loss": 0.12041836231946945, + "epoch": 4.826551034022682, + "step": 14470 + }, + { + "epoch": 4.826551034022682, + "ref_ce_loss": 0.0818629190325737, + "step": 14470 + }, + { + "epoch": 4.829886591060707, + "loss": 0.4342, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "grad_norm": 2.767193078994751, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "learning_rate": 0.00010748575262272406, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "loss": 0.29959261417388916, + "step": 14480 + }, + { + "ce_loss": 0.08880975842475891, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "distill_loss": 0.13206779956817627, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "ref_ce_loss": 0.048692721873521805, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "loss": 0.5737348794937134, + "step": 14480 + }, + { + "ce_loss": 0.11966817826032639, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "distill_loss": 0.1964222639799118, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "ref_ce_loss": 0.07506822049617767, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "loss": 0.6172971725463867, + "step": 14480 + }, + { + "ce_loss": 0.15250805020332336, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "distill_loss": 0.179644376039505, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "ref_ce_loss": 0.09086277335882187, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "loss": 0.3475439250469208, + "step": 14480 + }, + { + "ce_loss": 0.06675507873296738, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "distill_loss": 0.14787790179252625, + "epoch": 4.829886591060707, + "step": 14480 + }, + { + "epoch": 4.829886591060707, + "ref_ce_loss": 0.0421852171421051, + "step": 14480 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.3986, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "grad_norm": 1.9744664430618286, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "learning_rate": 0.00010729153637202389, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.5732771158218384, + "step": 14490 + }, + { + "ce_loss": 0.11101856082677841, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "distill_loss": 0.1596483588218689, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "ref_ce_loss": 0.13086585700511932, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.5659236907958984, + "step": 14490 + }, + { + "ce_loss": 0.10885528475046158, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "distill_loss": 0.13874022662639618, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "ref_ce_loss": 0.06971140950918198, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.26478707790374756, + "step": 14490 + }, + { + "ce_loss": 0.05931922420859337, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "distill_loss": 0.13022437691688538, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "ref_ce_loss": 0.0495450533926487, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "loss": 0.46827933192253113, + "step": 14490 + }, + { + "ce_loss": 0.07739992439746857, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "distill_loss": 0.14634524285793304, + "epoch": 4.8332221480987325, + "step": 14490 + }, + { + "epoch": 4.8332221480987325, + "ref_ce_loss": 0.05592044070363045, + "step": 14490 + }, + { + "epoch": 4.836557705136758, + "loss": 0.4342, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "grad_norm": 2.3469674587249756, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "learning_rate": 0.00010709739800471433, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "loss": 0.3189893662929535, + "step": 14500 + }, + { + "ce_loss": 0.07006145268678665, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "distill_loss": 0.12909801304340363, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "ref_ce_loss": 0.08902038633823395, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "loss": 0.4594857394695282, + "step": 14500 + }, + { + "ce_loss": 0.12085427343845367, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "distill_loss": 0.15103530883789062, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "ref_ce_loss": 0.09070585668087006, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "loss": 0.33890047669410706, + "step": 14500 + }, + { + "ce_loss": 0.09388411045074463, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "distill_loss": 0.12131661921739578, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "ref_ce_loss": 0.08646147698163986, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "loss": 0.3899589776992798, + "step": 14500 + }, + { + "ce_loss": 0.11472490429878235, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "distill_loss": 0.17198053002357483, + "epoch": 4.836557705136758, + "step": 14500 + }, + { + "epoch": 4.836557705136758, + "ref_ce_loss": 0.07948368787765503, + "step": 14500 + }, + { + "epoch": 4.839893262174783, + "loss": 0.3939, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "grad_norm": 2.1642537117004395, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "learning_rate": 0.00010690333787482708, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "loss": 0.5003763437271118, + "step": 14510 + }, + { + "ce_loss": 0.0803409069776535, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "distill_loss": 0.13409990072250366, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "ref_ce_loss": 0.10229332000017166, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "loss": 0.3218827545642853, + "step": 14510 + }, + { + "ce_loss": 0.12027572095394135, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "distill_loss": 0.12687276303768158, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "ref_ce_loss": 0.07463307678699493, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "loss": 0.43994343280792236, + "step": 14510 + }, + { + "ce_loss": 0.13173320889472961, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "distill_loss": 0.174768328666687, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "ref_ce_loss": 0.0962105318903923, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "loss": 0.3595537841320038, + "step": 14510 + }, + { + "ce_loss": 0.11435595899820328, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "distill_loss": 0.1392708569765091, + "epoch": 4.839893262174783, + "step": 14510 + }, + { + "epoch": 4.839893262174783, + "ref_ce_loss": 0.10565324127674103, + "step": 14510 + }, + { + "epoch": 4.843228819212809, + "loss": 0.4273, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "grad_norm": 4.729673385620117, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "learning_rate": 0.00010670935633625125, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "loss": 0.39400550723075867, + "step": 14520 + }, + { + "ce_loss": 0.12447620928287506, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "distill_loss": 0.1449531614780426, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "ref_ce_loss": 0.0862637460231781, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "loss": 0.3979857563972473, + "step": 14520 + }, + { + "ce_loss": 0.11020766198635101, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "distill_loss": 0.1860402375459671, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "ref_ce_loss": 0.07654957473278046, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "loss": 0.4634833335876465, + "step": 14520 + }, + { + "ce_loss": 0.12632647156715393, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "distill_loss": 0.14024150371551514, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "ref_ce_loss": 0.13895127177238464, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "loss": 0.2567285895347595, + "step": 14520 + }, + { + "ce_loss": 0.04383685067296028, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "distill_loss": 0.147134467959404, + "epoch": 4.843228819212809, + "step": 14520 + }, + { + "epoch": 4.843228819212809, + "ref_ce_loss": 0.04801378771662712, + "step": 14520 + }, + { + "epoch": 4.846564376250834, + "loss": 0.43, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "grad_norm": 3.172746419906616, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "learning_rate": 0.0001065154537427328, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "loss": 0.3927261233329773, + "step": 14530 + }, + { + "ce_loss": 0.09357485920190811, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "distill_loss": 0.11319194734096527, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "ref_ce_loss": 0.09334545582532883, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "loss": 0.3599247336387634, + "step": 14530 + }, + { + "ce_loss": 0.05117193982005119, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "distill_loss": 0.12257934361696243, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "ref_ce_loss": 0.05467826500535011, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "loss": 0.6843513250350952, + "step": 14530 + }, + { + "ce_loss": 0.10040271282196045, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "distill_loss": 0.14522188901901245, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "ref_ce_loss": 0.08244102448225021, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "loss": 0.4031752943992615, + "step": 14530 + }, + { + "ce_loss": 0.14085879921913147, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "distill_loss": 0.16135947406291962, + "epoch": 4.846564376250834, + "step": 14530 + }, + { + "epoch": 4.846564376250834, + "ref_ce_loss": 0.06957376003265381, + "step": 14530 + }, + { + "epoch": 4.849899933288859, + "loss": 0.4429, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "grad_norm": 3.005507230758667, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "learning_rate": 0.0001063216304478734, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "loss": 0.291361004114151, + "step": 14540 + }, + { + "ce_loss": 0.042861636728048325, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "distill_loss": 0.10996908694505692, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "ref_ce_loss": 0.09997054189443588, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "loss": 0.4132578372955322, + "step": 14540 + }, + { + "ce_loss": 0.11565080285072327, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "distill_loss": 0.18525856733322144, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "ref_ce_loss": 0.08986419439315796, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "loss": 0.5397231578826904, + "step": 14540 + }, + { + "ce_loss": 0.09195056557655334, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "distill_loss": 0.12949776649475098, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "ref_ce_loss": 0.1093427911400795, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "loss": 0.5233044028282166, + "step": 14540 + }, + { + "ce_loss": 0.1069001853466034, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "distill_loss": 0.17089305818080902, + "epoch": 4.849899933288859, + "step": 14540 + }, + { + "epoch": 4.849899933288859, + "ref_ce_loss": 0.11540444940328598, + "step": 14540 + }, + { + "epoch": 4.853235490326885, + "loss": 0.4654, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "grad_norm": 2.990715265274048, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "learning_rate": 0.00010612788680513038, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "loss": 0.40204113721847534, + "step": 14550 + }, + { + "ce_loss": 0.1541042923927307, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "distill_loss": 0.14879141747951508, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "ref_ce_loss": 0.09888723492622375, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "loss": 0.35200127959251404, + "step": 14550 + }, + { + "ce_loss": 0.11289265751838684, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "distill_loss": 0.15979474782943726, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "ref_ce_loss": 0.07536427676677704, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "loss": 0.3538520038127899, + "step": 14550 + }, + { + "ce_loss": 0.09295067936182022, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "distill_loss": 0.1400989592075348, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "ref_ce_loss": 0.08571676164865494, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "loss": 0.43285155296325684, + "step": 14550 + }, + { + "ce_loss": 0.09952595084905624, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "distill_loss": 0.1657617688179016, + "epoch": 4.853235490326885, + "step": 14550 + }, + { + "epoch": 4.853235490326885, + "ref_ce_loss": 0.09958847612142563, + "step": 14550 + }, + { + "epoch": 4.85657104736491, + "loss": 0.4291, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "grad_norm": 2.8867013454437256, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "learning_rate": 0.00010593422316781567, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "loss": 0.4896565079689026, + "step": 14560 + }, + { + "ce_loss": 0.1618562489748001, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "distill_loss": 0.15716411173343658, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "ref_ce_loss": 0.09468694031238556, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "loss": 0.4915570318698883, + "step": 14560 + }, + { + "ce_loss": 0.18240197002887726, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "distill_loss": 0.15938079357147217, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "ref_ce_loss": 0.10698254406452179, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "loss": 0.39248335361480713, + "step": 14560 + }, + { + "ce_loss": 0.11607158184051514, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "distill_loss": 0.15520218014717102, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "ref_ce_loss": 0.09117502719163895, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "loss": 0.3708113133907318, + "step": 14560 + }, + { + "ce_loss": 0.053829822689294815, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "distill_loss": 0.1012469157576561, + "epoch": 4.85657104736491, + "step": 14560 + }, + { + "epoch": 4.85657104736491, + "ref_ce_loss": 0.11660492420196533, + "step": 14560 + }, + { + "epoch": 4.859906604402935, + "loss": 0.4288, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "grad_norm": 2.4022624492645264, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "learning_rate": 0.00010574063988909538, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "loss": 0.3762693405151367, + "step": 14570 + }, + { + "ce_loss": 0.09208332747220993, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "distill_loss": 0.1544317752122879, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "ref_ce_loss": 0.0855952724814415, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "loss": 0.40494173765182495, + "step": 14570 + }, + { + "ce_loss": 0.09856753051280975, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "distill_loss": 0.13378620147705078, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "ref_ce_loss": 0.0777980238199234, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "loss": 0.31158962845802307, + "step": 14570 + }, + { + "ce_loss": 0.08274923264980316, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "distill_loss": 0.144447922706604, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "ref_ce_loss": 0.0840868353843689, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "loss": 0.3044297993183136, + "step": 14570 + }, + { + "ce_loss": 0.06392364948987961, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "distill_loss": 0.13459810614585876, + "epoch": 4.859906604402935, + "step": 14570 + }, + { + "epoch": 4.859906604402935, + "ref_ce_loss": 0.08571823686361313, + "step": 14570 + }, + { + "epoch": 4.863242161440961, + "loss": 0.3983, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "grad_norm": 2.0568363666534424, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "learning_rate": 0.00010554713732198905, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "loss": 0.6924458742141724, + "step": 14580 + }, + { + "ce_loss": 0.2798740565776825, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "distill_loss": 0.16391947865486145, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "ref_ce_loss": 0.15253989398479462, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "loss": 0.4279673397541046, + "step": 14580 + }, + { + "ce_loss": 0.11625529825687408, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "distill_loss": 0.13469599187374115, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "ref_ce_loss": 0.0896773636341095, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "loss": 0.3884398341178894, + "step": 14580 + }, + { + "ce_loss": 0.13770216703414917, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "distill_loss": 0.13542982935905457, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "ref_ce_loss": 0.09085490554571152, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "loss": 0.27767953276634216, + "step": 14580 + }, + { + "ce_loss": 0.06366454809904099, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "distill_loss": 0.09851067513227463, + "epoch": 4.863242161440961, + "step": 14580 + }, + { + "epoch": 4.863242161440961, + "ref_ce_loss": 0.06787992268800735, + "step": 14580 + }, + { + "epoch": 4.866577718478986, + "loss": 0.3815, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "grad_norm": 2.244171380996704, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "learning_rate": 0.000105353715819369, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "loss": 0.40847817063331604, + "step": 14590 + }, + { + "ce_loss": 0.14665713906288147, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "distill_loss": 0.11157718300819397, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "ref_ce_loss": 0.12388044595718384, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "loss": 0.39813506603240967, + "step": 14590 + }, + { + "ce_loss": 0.11959546059370041, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "distill_loss": 0.12179628759622574, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "ref_ce_loss": 0.11253191530704498, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "loss": 0.2991400361061096, + "step": 14590 + }, + { + "ce_loss": 0.06128731742501259, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "distill_loss": 0.1151864305138588, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "ref_ce_loss": 0.07057370245456696, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "loss": 0.39899754524230957, + "step": 14590 + }, + { + "ce_loss": 0.10590605437755585, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "distill_loss": 0.13386231660842896, + "epoch": 4.866577718478986, + "step": 14590 + }, + { + "epoch": 4.866577718478986, + "ref_ce_loss": 0.09403873980045319, + "step": 14590 + }, + { + "epoch": 4.869913275517011, + "loss": 0.3918, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "grad_norm": 3.4217731952667236, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "learning_rate": 0.00010516037573395978, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "loss": 0.3474060893058777, + "step": 14600 + }, + { + "ce_loss": 0.07884716987609863, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "distill_loss": 0.13276150822639465, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "ref_ce_loss": 0.08149795234203339, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "loss": 0.4022737741470337, + "step": 14600 + }, + { + "ce_loss": 0.16848134994506836, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "distill_loss": 0.11421038955450058, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "ref_ce_loss": 0.10647285729646683, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "loss": 0.6741193532943726, + "step": 14600 + }, + { + "ce_loss": 0.1266450732946396, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "distill_loss": 0.14333488047122955, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "ref_ce_loss": 0.08235146850347519, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "loss": 0.28810229897499084, + "step": 14600 + }, + { + "ce_loss": 0.0736110657453537, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "distill_loss": 0.11870429664850235, + "epoch": 4.869913275517011, + "step": 14600 + }, + { + "epoch": 4.869913275517011, + "ref_ce_loss": 0.048248760402202606, + "step": 14600 + }, + { + "epoch": 4.873248832555037, + "loss": 0.4, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "grad_norm": 2.2944371700286865, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "learning_rate": 0.00010496711741833745, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "loss": 0.39695531129837036, + "step": 14610 + }, + { + "ce_loss": 0.07801104336977005, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "distill_loss": 0.10177255421876907, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "ref_ce_loss": 0.08016975969076157, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "loss": 0.46816474199295044, + "step": 14610 + }, + { + "ce_loss": 0.15950001776218414, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "distill_loss": 0.14450381696224213, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "ref_ce_loss": 0.13032293319702148, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "loss": 0.3655878007411957, + "step": 14610 + }, + { + "ce_loss": 0.0782846063375473, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "distill_loss": 0.11802016943693161, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "ref_ce_loss": 0.07457852363586426, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "loss": 0.2640596926212311, + "step": 14610 + }, + { + "ce_loss": 0.06261890381574631, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "distill_loss": 0.1388099044561386, + "epoch": 4.873248832555037, + "step": 14610 + }, + { + "epoch": 4.873248832555037, + "ref_ce_loss": 0.06244688853621483, + "step": 14610 + }, + { + "epoch": 4.876584389593062, + "loss": 0.3597, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "grad_norm": 8.353504180908203, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "learning_rate": 0.0001047739412249289, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "loss": 0.35462868213653564, + "step": 14620 + }, + { + "ce_loss": 0.05959922820329666, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "distill_loss": 0.09392604976892471, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "ref_ce_loss": 0.06135998293757439, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "loss": 0.4801032543182373, + "step": 14620 + }, + { + "ce_loss": 0.1412152349948883, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "distill_loss": 0.15411214530467987, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "ref_ce_loss": 0.08248871564865112, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "loss": 0.5009456872940063, + "step": 14620 + }, + { + "ce_loss": 0.06161312758922577, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "distill_loss": 0.09107838571071625, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "ref_ce_loss": 0.05931251123547554, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "loss": 0.29225414991378784, + "step": 14620 + }, + { + "ce_loss": 0.06127912178635597, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "distill_loss": 0.12588757276535034, + "epoch": 4.876584389593062, + "step": 14620 + }, + { + "epoch": 4.876584389593062, + "ref_ce_loss": 0.10451364517211914, + "step": 14620 + }, + { + "epoch": 4.879919946631087, + "loss": 0.3968, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "grad_norm": 2.361440420150757, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "learning_rate": 0.00010458084750601137, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "loss": 0.34836554527282715, + "step": 14630 + }, + { + "ce_loss": 0.1210658922791481, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "distill_loss": 0.11962322890758514, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "ref_ce_loss": 0.10724155604839325, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "loss": 0.42567917704582214, + "step": 14630 + }, + { + "ce_loss": 0.08495273441076279, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "distill_loss": 0.09693010151386261, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "ref_ce_loss": 0.08949395269155502, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "loss": 0.4357303977012634, + "step": 14630 + }, + { + "ce_loss": 0.0803510919213295, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "distill_loss": 0.13041549921035767, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "ref_ce_loss": 0.12345951050519943, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "loss": 0.26853373646736145, + "step": 14630 + }, + { + "ce_loss": 0.05560803785920143, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "distill_loss": 0.10235027968883514, + "epoch": 4.879919946631087, + "step": 14630 + }, + { + "epoch": 4.879919946631087, + "ref_ce_loss": 0.11022245138883591, + "step": 14630 + }, + { + "epoch": 4.883255503669113, + "loss": 0.3832, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "grad_norm": 2.1481926441192627, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "learning_rate": 0.00010438783661371154, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "loss": 0.35740113258361816, + "step": 14640 + }, + { + "ce_loss": 0.091608926653862, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "distill_loss": 0.10623601078987122, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "ref_ce_loss": 0.09509522467851639, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "loss": 0.4130536615848541, + "step": 14640 + }, + { + "ce_loss": 0.11907906085252762, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "distill_loss": 0.09680334478616714, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "ref_ce_loss": 0.10953420400619507, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "loss": 0.33304738998413086, + "step": 14640 + }, + { + "ce_loss": 0.0939435064792633, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "distill_loss": 0.12896057963371277, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "ref_ce_loss": 0.0519283190369606, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "loss": 0.2252245545387268, + "step": 14640 + }, + { + "ce_loss": 0.054917193949222565, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "distill_loss": 0.0945071280002594, + "epoch": 4.883255503669113, + "step": 14640 + }, + { + "epoch": 4.883255503669113, + "ref_ce_loss": 0.07565527409315109, + "step": 14640 + }, + { + "epoch": 4.886591060707138, + "loss": 0.3909, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "grad_norm": 2.6093320846557617, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "learning_rate": 0.00010419490890000523, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "loss": 0.2625221908092499, + "step": 14650 + }, + { + "ce_loss": 0.07245766371488571, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "distill_loss": 0.11005422472953796, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "ref_ce_loss": 0.07978059351444244, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "loss": 0.3463577330112457, + "step": 14650 + }, + { + "ce_loss": 0.11487383395433426, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "distill_loss": 0.12385935336351395, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "ref_ce_loss": 0.08501281589269638, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "loss": 0.4161044955253601, + "step": 14650 + }, + { + "ce_loss": 0.14107713103294373, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "distill_loss": 0.13545288145542145, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "ref_ce_loss": 0.09442567825317383, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "loss": 0.407216340303421, + "step": 14650 + }, + { + "ce_loss": 0.09006106853485107, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "distill_loss": 0.10209393501281738, + "epoch": 4.886591060707138, + "step": 14650 + }, + { + "epoch": 4.886591060707138, + "ref_ce_loss": 0.09535461664199829, + "step": 14650 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.4326, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "grad_norm": 2.7057816982269287, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "learning_rate": 0.00010400206471671645, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.48617222905158997, + "step": 14660 + }, + { + "ce_loss": 0.16330264508724213, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "distill_loss": 0.13402962684631348, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "ref_ce_loss": 0.1386226862668991, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.5076068639755249, + "step": 14660 + }, + { + "ce_loss": 0.12294797599315643, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "distill_loss": 0.1680523306131363, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "ref_ce_loss": 0.1611853539943695, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.42812928557395935, + "step": 14660 + }, + { + "ce_loss": 0.10041183978319168, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "distill_loss": 0.11321305483579636, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "ref_ce_loss": 0.07629314810037613, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "loss": 0.25367259979248047, + "step": 14660 + }, + { + "ce_loss": 0.06785446405410767, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "distill_loss": 0.09600482881069183, + "epoch": 4.8899266177451635, + "step": 14660 + }, + { + "epoch": 4.8899266177451635, + "ref_ce_loss": 0.06254395842552185, + "step": 14660 + }, + { + "epoch": 4.893262174783189, + "loss": 0.4031, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "grad_norm": 2.916531562805176, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "learning_rate": 0.00010380930441551692, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "loss": 0.9963822364807129, + "step": 14670 + }, + { + "ce_loss": 0.1517850160598755, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "distill_loss": 0.11565428227186203, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "ref_ce_loss": 0.07845375686883926, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "loss": 0.36983203887939453, + "step": 14670 + }, + { + "ce_loss": 0.07163172960281372, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "distill_loss": 0.1265677660703659, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "ref_ce_loss": 0.08316890150308609, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "loss": 0.2734651267528534, + "step": 14670 + }, + { + "ce_loss": 0.08729465305805206, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "distill_loss": 0.10305607318878174, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "ref_ce_loss": 0.06024034693837166, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "loss": 0.3107224404811859, + "step": 14670 + }, + { + "ce_loss": 0.0668327808380127, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "distill_loss": 0.1244446337223053, + "epoch": 4.893262174783189, + "step": 14670 + }, + { + "epoch": 4.893262174783189, + "ref_ce_loss": 0.05027075111865997, + "step": 14670 + }, + { + "epoch": 4.896597731821214, + "loss": 0.4134, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "grad_norm": 3.499980926513672, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "learning_rate": 0.00010361662834792541, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "loss": 0.6011641025543213, + "step": 14680 + }, + { + "ce_loss": 0.06787215173244476, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "distill_loss": 0.10903525352478027, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "ref_ce_loss": 0.07811160385608673, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "loss": 0.3264457583427429, + "step": 14680 + }, + { + "ce_loss": 0.11346080899238586, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "distill_loss": 0.08637680113315582, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "ref_ce_loss": 0.09686467051506042, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "loss": 0.577754020690918, + "step": 14680 + }, + { + "ce_loss": 0.10299709439277649, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "distill_loss": 0.12928466498851776, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "ref_ce_loss": 0.15525569021701813, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "loss": 0.3560793995857239, + "step": 14680 + }, + { + "ce_loss": 0.11099941283464432, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "distill_loss": 0.11336791515350342, + "epoch": 4.896597731821214, + "step": 14680 + }, + { + "epoch": 4.896597731821214, + "ref_ce_loss": 0.09310568124055862, + "step": 14680 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.4816, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "grad_norm": 2.3682327270507812, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "learning_rate": 0.00010342403686530702, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.3210378587245941, + "step": 14690 + }, + { + "ce_loss": 0.09722641110420227, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "distill_loss": 0.10519842803478241, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "ref_ce_loss": 0.11840209364891052, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.33558833599090576, + "step": 14690 + }, + { + "ce_loss": 0.10728945583105087, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "distill_loss": 0.1204787865281105, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "ref_ce_loss": 0.10764899104833603, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.369645893573761, + "step": 14690 + }, + { + "ce_loss": 0.0854637622833252, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "distill_loss": 0.0883425697684288, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "ref_ce_loss": 0.07716590911149979, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "loss": 0.5961611866950989, + "step": 14690 + }, + { + "ce_loss": 0.15953610837459564, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "distill_loss": 0.14236530661582947, + "epoch": 4.8999332888592395, + "step": 14690 + }, + { + "epoch": 4.8999332888592395, + "ref_ce_loss": 0.1169867068529129, + "step": 14690 + }, + { + "epoch": 4.903268845897265, + "loss": 0.4148, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "grad_norm": 3.376802444458008, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "learning_rate": 0.00010323153031887267, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "loss": 0.34086325764656067, + "step": 14700 + }, + { + "ce_loss": 0.09868812561035156, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "distill_loss": 0.09084837138652802, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "ref_ce_loss": 0.07390137016773224, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "loss": 0.4819284975528717, + "step": 14700 + }, + { + "ce_loss": 0.13006484508514404, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "distill_loss": 0.1349356472492218, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "ref_ce_loss": 0.11490657180547714, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "loss": 0.7943411469459534, + "step": 14700 + }, + { + "ce_loss": 0.1040106862783432, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "distill_loss": 0.11534878611564636, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "ref_ce_loss": 0.11220097541809082, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "loss": 0.3427712619304657, + "step": 14700 + }, + { + "ce_loss": 0.11044802516698837, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "distill_loss": 0.12651844322681427, + "epoch": 4.903268845897265, + "step": 14700 + }, + { + "epoch": 4.903268845897265, + "ref_ce_loss": 0.10573562979698181, + "step": 14700 + }, + { + "epoch": 4.90660440293529, + "loss": 0.4514, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "grad_norm": 2.766533851623535, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "learning_rate": 0.0001030391090596784, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "loss": 0.2449745088815689, + "step": 14710 + }, + { + "ce_loss": 0.0558619424700737, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "distill_loss": 0.10561040788888931, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "ref_ce_loss": 0.08319824934005737, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "loss": 0.3997700810432434, + "step": 14710 + }, + { + "ce_loss": 0.13185864686965942, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "distill_loss": 0.11414431780576706, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "ref_ce_loss": 0.07824277877807617, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "loss": 0.6474665403366089, + "step": 14710 + }, + { + "ce_loss": 0.14329788088798523, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "distill_loss": 0.12736766040325165, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "ref_ce_loss": 0.07725608348846436, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "loss": 0.2920524775981903, + "step": 14710 + }, + { + "ce_loss": 0.11028847098350525, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "distill_loss": 0.10173401981592178, + "epoch": 4.90660440293529, + "step": 14710 + }, + { + "epoch": 4.90660440293529, + "ref_ce_loss": 0.06018834933638573, + "step": 14710 + }, + { + "epoch": 4.909939959973316, + "loss": 0.4032, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "grad_norm": 2.0447373390197754, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "learning_rate": 0.00010284677343862461, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "loss": 0.5206084251403809, + "step": 14720 + }, + { + "ce_loss": 0.11765378713607788, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "distill_loss": 0.11574839055538177, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "ref_ce_loss": 0.09627361595630646, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "loss": 0.26153698563575745, + "step": 14720 + }, + { + "ce_loss": 0.056028977036476135, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "distill_loss": 0.12160011380910873, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "ref_ce_loss": 0.062378816306591034, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "loss": 0.40411004424095154, + "step": 14720 + }, + { + "ce_loss": 0.09394532442092896, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "distill_loss": 0.12955696880817413, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "ref_ce_loss": 0.11500399559736252, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "loss": 0.8402743339538574, + "step": 14720 + }, + { + "ce_loss": 0.08294497430324554, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "distill_loss": 0.10579205304384232, + "epoch": 4.909939959973316, + "step": 14720 + }, + { + "epoch": 4.909939959973316, + "ref_ce_loss": 0.0730375424027443, + "step": 14720 + }, + { + "epoch": 4.913275517011341, + "loss": 0.375, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "grad_norm": 2.3185336589813232, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "learning_rate": 0.0001026545238064557, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "loss": 0.31869810819625854, + "step": 14730 + }, + { + "ce_loss": 0.06146164610981941, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "distill_loss": 0.10617264360189438, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "ref_ce_loss": 0.11182975023984909, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "loss": 0.5334634184837341, + "step": 14730 + }, + { + "ce_loss": 0.09403307735919952, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "distill_loss": 0.105626180768013, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "ref_ce_loss": 0.09046263247728348, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "loss": 0.4704914391040802, + "step": 14730 + }, + { + "ce_loss": 0.14943642914295197, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "distill_loss": 0.12391936779022217, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "ref_ce_loss": 0.08655329793691635, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "loss": 0.8025875091552734, + "step": 14730 + }, + { + "ce_loss": 0.19269277155399323, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "distill_loss": 0.1386529505252838, + "epoch": 4.913275517011341, + "step": 14730 + }, + { + "epoch": 4.913275517011341, + "ref_ce_loss": 0.11787714064121246, + "step": 14730 + }, + { + "epoch": 4.916611074049366, + "loss": 0.3848, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "grad_norm": 1.9901200532913208, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "learning_rate": 0.00010246236051375899, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "loss": 0.35475262999534607, + "step": 14740 + }, + { + "ce_loss": 0.1351785510778427, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "distill_loss": 0.10362657904624939, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "ref_ce_loss": 0.11586165428161621, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "loss": 0.3570358157157898, + "step": 14740 + }, + { + "ce_loss": 0.06367281079292297, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "distill_loss": 0.10224097222089767, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "ref_ce_loss": 0.09009752422571182, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "loss": 0.25970083475112915, + "step": 14740 + }, + { + "ce_loss": 0.08381228893995285, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "distill_loss": 0.08009132742881775, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "ref_ce_loss": 0.09562067687511444, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "loss": 0.31035858392715454, + "step": 14740 + }, + { + "ce_loss": 0.03239971399307251, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "distill_loss": 0.0916653499007225, + "epoch": 4.916611074049366, + "step": 14740 + }, + { + "epoch": 4.916611074049366, + "ref_ce_loss": 0.07490362226963043, + "step": 14740 + }, + { + "epoch": 4.919946631087392, + "loss": 0.3686, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "grad_norm": 1.433788776397705, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "learning_rate": 0.00010227028391096469, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "loss": 0.3197961151599884, + "step": 14750 + }, + { + "ce_loss": 0.13021771609783173, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "distill_loss": 0.1070857048034668, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "ref_ce_loss": 0.0823674276471138, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "loss": 0.4557018280029297, + "step": 14750 + }, + { + "ce_loss": 0.17964871227741241, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "distill_loss": 0.14461633563041687, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "ref_ce_loss": 0.09955228120088577, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "loss": 0.3339523375034332, + "step": 14750 + }, + { + "ce_loss": 0.07346391677856445, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "distill_loss": 0.11796528100967407, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "ref_ce_loss": 0.08985766023397446, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "loss": 0.24510349333286285, + "step": 14750 + }, + { + "ce_loss": 0.04468422010540962, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "distill_loss": 0.09657389670610428, + "epoch": 4.919946631087392, + "step": 14750 + }, + { + "epoch": 4.919946631087392, + "ref_ce_loss": 0.08417684584856033, + "step": 14750 + }, + { + "epoch": 4.923282188125417, + "loss": 0.388, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "grad_norm": 2.640810012817383, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "learning_rate": 0.00010207829434834476, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "loss": 0.5349435806274414, + "step": 14760 + }, + { + "ce_loss": 0.09291098266839981, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "distill_loss": 0.14082714915275574, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "ref_ce_loss": 0.1280236840248108, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "loss": 0.6028576493263245, + "step": 14760 + }, + { + "ce_loss": 0.08771508932113647, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "distill_loss": 0.097988560795784, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "ref_ce_loss": 0.09704327583312988, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "loss": 0.3179771900177002, + "step": 14760 + }, + { + "ce_loss": 0.08840013295412064, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "distill_loss": 0.11706671863794327, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "ref_ce_loss": 0.08572734147310257, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "loss": 0.27515414357185364, + "step": 14760 + }, + { + "ce_loss": 0.07249299436807632, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "distill_loss": 0.10312556475400925, + "epoch": 4.923282188125417, + "step": 14760 + }, + { + "epoch": 4.923282188125417, + "ref_ce_loss": 0.06152420863509178, + "step": 14760 + }, + { + "epoch": 4.926617745163442, + "loss": 0.4136, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "grad_norm": 3.415719509124756, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "learning_rate": 0.00010188639217601227, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "loss": 0.3267880976200104, + "step": 14770 + }, + { + "ce_loss": 0.0738874077796936, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "distill_loss": 0.13988655805587769, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "ref_ce_loss": 0.0890965461730957, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "loss": 0.607414960861206, + "step": 14770 + }, + { + "ce_loss": 0.05598186329007149, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "distill_loss": 0.1434912234544754, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "ref_ce_loss": 0.0997924655675888, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "loss": 0.3053884506225586, + "step": 14770 + }, + { + "ce_loss": 0.08139229565858841, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "distill_loss": 0.10093171149492264, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "ref_ce_loss": 0.12291386723518372, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "loss": 0.3276132047176361, + "step": 14770 + }, + { + "ce_loss": 0.0874326154589653, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "distill_loss": 0.14592541754245758, + "epoch": 4.926617745163442, + "step": 14770 + }, + { + "epoch": 4.926617745163442, + "ref_ce_loss": 0.0702369436621666, + "step": 14770 + }, + { + "epoch": 4.929953302201468, + "loss": 0.4035, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "grad_norm": 2.4359805583953857, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "learning_rate": 0.00010169457774392122, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "loss": 0.3487001061439514, + "step": 14780 + }, + { + "ce_loss": 0.09250488132238388, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "distill_loss": 0.11125314980745316, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "ref_ce_loss": 0.09922449290752411, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "loss": 0.40673667192459106, + "step": 14780 + }, + { + "ce_loss": 0.10500854253768921, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "distill_loss": 0.11264103651046753, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "ref_ce_loss": 0.0795854777097702, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "loss": 0.32172906398773193, + "step": 14780 + }, + { + "ce_loss": 0.1017569825053215, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "distill_loss": 0.099338598549366, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "ref_ce_loss": 0.09416161477565765, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "loss": 0.3082839250564575, + "step": 14780 + }, + { + "ce_loss": 0.08893798291683197, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "distill_loss": 0.12213946878910065, + "epoch": 4.929953302201468, + "step": 14780 + }, + { + "epoch": 4.929953302201468, + "ref_ce_loss": 0.07671026140451431, + "step": 14780 + }, + { + "epoch": 4.933288859239493, + "loss": 0.3804, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "grad_norm": 3.638474941253662, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "learning_rate": 0.00010150285140186546, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "loss": 0.41449302434921265, + "step": 14790 + }, + { + "ce_loss": 0.04984421283006668, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "distill_loss": 0.09936082363128662, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "ref_ce_loss": 0.1158912256360054, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "loss": 0.5383901000022888, + "step": 14790 + }, + { + "ce_loss": 0.23258216679096222, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "distill_loss": 0.1699237823486328, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "ref_ce_loss": 0.10595542937517166, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "loss": 0.3543599247932434, + "step": 14790 + }, + { + "ce_loss": 0.14072558283805847, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "distill_loss": 0.11986932903528214, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "ref_ce_loss": 0.0930936187505722, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "loss": 0.31760847568511963, + "step": 14790 + }, + { + "ce_loss": 0.0677962526679039, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "distill_loss": 0.09178701788187027, + "epoch": 4.933288859239493, + "step": 14790 + }, + { + "epoch": 4.933288859239493, + "ref_ce_loss": 0.11888806521892548, + "step": 14790 + }, + { + "epoch": 4.936624416277518, + "loss": 0.3989, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "grad_norm": 5.042342662811279, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "learning_rate": 0.00010131121349947811, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "loss": 0.5192350745201111, + "step": 14800 + }, + { + "ce_loss": 0.22760631144046783, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "distill_loss": 0.13159260153770447, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "ref_ce_loss": 0.12293851375579834, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "loss": 0.3765476942062378, + "step": 14800 + }, + { + "ce_loss": 0.08823811262845993, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "distill_loss": 0.09266003221273422, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "ref_ce_loss": 0.08648016303777695, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "loss": 0.44473010301589966, + "step": 14800 + }, + { + "ce_loss": 0.1467081606388092, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "distill_loss": 0.12626172602176666, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "ref_ce_loss": 0.11209473758935928, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "loss": 0.2532784938812256, + "step": 14800 + }, + { + "ce_loss": 0.08552855998277664, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "distill_loss": 0.09357450902462006, + "epoch": 4.936624416277518, + "step": 14800 + }, + { + "epoch": 4.936624416277518, + "ref_ce_loss": 0.07390519231557846, + "step": 14800 + }, + { + "epoch": 4.939959973315544, + "loss": 0.3884, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "grad_norm": 2.153294801712036, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "learning_rate": 0.00010111966438623127, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "loss": 0.2643769681453705, + "step": 14810 + }, + { + "ce_loss": 0.0686085894703865, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "distill_loss": 0.09371940046548843, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "ref_ce_loss": 0.07681851089000702, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "loss": 0.5107899904251099, + "step": 14810 + }, + { + "ce_loss": 0.058815546333789825, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "distill_loss": 0.08406341820955276, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "ref_ce_loss": 0.09192143380641937, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "loss": 0.436542809009552, + "step": 14810 + }, + { + "ce_loss": 0.1103186309337616, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "distill_loss": 0.10437675565481186, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "ref_ce_loss": 0.11292348057031631, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "loss": 0.6722804307937622, + "step": 14810 + }, + { + "ce_loss": 0.1129743903875351, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "distill_loss": 0.13765385746955872, + "epoch": 4.939959973315544, + "step": 14810 + }, + { + "epoch": 4.939959973315544, + "ref_ce_loss": 0.13108649849891663, + "step": 14810 + }, + { + "epoch": 4.943295530353569, + "loss": 0.4369, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "grad_norm": 3.259122133255005, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "learning_rate": 0.00010092820441143482, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "loss": 0.2509721517562866, + "step": 14820 + }, + { + "ce_loss": 0.07367528975009918, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "distill_loss": 0.0924886018037796, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "ref_ce_loss": 0.08422941714525223, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "loss": 0.3739933669567108, + "step": 14820 + }, + { + "ce_loss": 0.14804257452487946, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "distill_loss": 0.10976754873991013, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "ref_ce_loss": 0.11567100137472153, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "loss": 0.4644673466682434, + "step": 14820 + }, + { + "ce_loss": 0.14605847001075745, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "distill_loss": 0.08814170211553574, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "ref_ce_loss": 0.09028080850839615, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "loss": 0.42085397243499756, + "step": 14820 + }, + { + "ce_loss": 0.15428964793682098, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "distill_loss": 0.12081962078809738, + "epoch": 4.943295530353569, + "step": 14820 + }, + { + "epoch": 4.943295530353569, + "ref_ce_loss": 0.09920281916856766, + "step": 14820 + }, + { + "epoch": 4.946631087391594, + "loss": 0.3485, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "grad_norm": 2.371697187423706, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "learning_rate": 0.00010073683392423623, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "loss": 0.19433481991291046, + "step": 14830 + }, + { + "ce_loss": 0.034791141748428345, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "distill_loss": 0.09125552326440811, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "ref_ce_loss": 0.06818924844264984, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "loss": 0.31020668148994446, + "step": 14830 + }, + { + "ce_loss": 0.09354670345783234, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "distill_loss": 0.10771866887807846, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "ref_ce_loss": 0.07463540136814117, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "loss": 0.5806359648704529, + "step": 14830 + }, + { + "ce_loss": 0.14421100914478302, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "distill_loss": 0.11986065655946732, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "ref_ce_loss": 0.11439379304647446, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "loss": 0.21567516028881073, + "step": 14830 + }, + { + "ce_loss": 0.028816038742661476, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "distill_loss": 0.10529807209968567, + "epoch": 4.946631087391594, + "step": 14830 + }, + { + "epoch": 4.946631087391594, + "ref_ce_loss": 0.0813959464430809, + "step": 14830 + }, + { + "epoch": 4.94996664442962, + "loss": 0.3416, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "grad_norm": 4.2593994140625, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "learning_rate": 0.00010054555327361993, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "loss": 0.19674266874790192, + "step": 14840 + }, + { + "ce_loss": 0.033592235296964645, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "distill_loss": 0.09351490437984467, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "ref_ce_loss": 0.06934471428394318, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "loss": 0.48189353942871094, + "step": 14840 + }, + { + "ce_loss": 0.10756346583366394, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "distill_loss": 0.10618402063846588, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "ref_ce_loss": 0.1061076894402504, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "loss": 0.4971357583999634, + "step": 14840 + }, + { + "ce_loss": 0.13811559975147247, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "distill_loss": 0.12884469330310822, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "ref_ce_loss": 0.11955190449953079, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "loss": 0.6511322855949402, + "step": 14840 + }, + { + "ce_loss": 0.184445321559906, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "distill_loss": 0.15705184638500214, + "epoch": 4.94996664442962, + "step": 14840 + }, + { + "epoch": 4.94996664442962, + "ref_ce_loss": 0.15733648836612701, + "step": 14840 + }, + { + "epoch": 4.953302201467645, + "loss": 0.4032, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "grad_norm": 3.079538106918335, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "learning_rate": 0.00010035436280840621, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "loss": 0.2901090383529663, + "step": 14850 + }, + { + "ce_loss": 0.03799564763903618, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "distill_loss": 0.10415848344564438, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "ref_ce_loss": 0.09406749904155731, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "loss": 0.3220950961112976, + "step": 14850 + }, + { + "ce_loss": 0.11926550418138504, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "distill_loss": 0.11953283846378326, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "ref_ce_loss": 0.08315497636795044, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "loss": 0.4622819125652313, + "step": 14850 + }, + { + "ce_loss": 0.1031547486782074, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "distill_loss": 0.09223375469446182, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "ref_ce_loss": 0.10606669634580612, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "loss": 0.33610832691192627, + "step": 14850 + }, + { + "ce_loss": 0.06804166734218597, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "distill_loss": 0.10635029524564743, + "epoch": 4.953302201467645, + "step": 14850 + }, + { + "epoch": 4.953302201467645, + "ref_ce_loss": 0.07996688038110733, + "step": 14850 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.381, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "grad_norm": 2.194052219390869, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "learning_rate": 0.00010016326287725116, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.3040575087070465, + "step": 14860 + }, + { + "ce_loss": 0.08786673098802567, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "distill_loss": 0.14260633289813995, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "ref_ce_loss": 0.0735347643494606, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.8339674472808838, + "step": 14860 + }, + { + "ce_loss": 0.1228819414973259, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "distill_loss": 0.15760846436023712, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "ref_ce_loss": 0.10669492930173874, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.3304751217365265, + "step": 14860 + }, + { + "ce_loss": 0.09910108894109726, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "distill_loss": 0.14979346096515656, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "ref_ce_loss": 0.08147692680358887, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "loss": 0.2014983594417572, + "step": 14860 + }, + { + "ce_loss": 0.032280270010232925, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "distill_loss": 0.08623968809843063, + "epoch": 4.9566377585056705, + "step": 14860 + }, + { + "epoch": 4.9566377585056705, + "ref_ce_loss": 0.05587891489267349, + "step": 14860 + }, + { + "epoch": 4.959973315543696, + "loss": 0.3914, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "grad_norm": 4.903111934661865, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "learning_rate": 9.997225382864559e-05, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "loss": 0.2674598693847656, + "step": 14870 + }, + { + "ce_loss": 0.056575529277324677, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "distill_loss": 0.10776714235544205, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "ref_ce_loss": 0.057600509375333786, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "loss": 0.521746039390564, + "step": 14870 + }, + { + "ce_loss": 0.09200442582368851, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "distill_loss": 0.13319039344787598, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "ref_ce_loss": 0.09574585407972336, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "loss": 0.33523058891296387, + "step": 14870 + }, + { + "ce_loss": 0.06467235833406448, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "distill_loss": 0.11361615359783173, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "ref_ce_loss": 0.09359608590602875, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "loss": 0.5538613200187683, + "step": 14870 + }, + { + "ce_loss": 0.17402806878089905, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "distill_loss": 0.16963981091976166, + "epoch": 4.959973315543696, + "step": 14870 + }, + { + "epoch": 4.959973315543696, + "ref_ce_loss": 0.12446580082178116, + "step": 14870 + }, + { + "epoch": 4.963308872581721, + "loss": 0.4007, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "grad_norm": 2.6322154998779297, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "learning_rate": 9.97813360109147e-05, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "loss": 0.6043708920478821, + "step": 14880 + }, + { + "ce_loss": 0.1652269959449768, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "distill_loss": 0.12191038578748703, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "ref_ce_loss": 0.10965677350759506, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "loss": 0.4230524003505707, + "step": 14880 + }, + { + "ce_loss": 0.09956231713294983, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "distill_loss": 0.1212780550122261, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "ref_ce_loss": 0.09010248631238937, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "loss": 0.713090181350708, + "step": 14880 + }, + { + "ce_loss": 0.17262235283851624, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "distill_loss": 0.15157324075698853, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "ref_ce_loss": 0.10012070834636688, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "loss": 0.40841516852378845, + "step": 14880 + }, + { + "ce_loss": 0.14642496407032013, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "distill_loss": 0.11657308787107468, + "epoch": 4.963308872581721, + "step": 14880 + }, + { + "epoch": 4.963308872581721, + "ref_ce_loss": 0.109087735414505, + "step": 14880 + }, + { + "epoch": 4.9666444296197465, + "loss": 0.4133, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "grad_norm": 1.9244799613952637, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "learning_rate": 9.959050977221732e-05, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "loss": 0.8595991134643555, + "step": 14890 + }, + { + "ce_loss": 0.17607881128787994, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "distill_loss": 0.10831549018621445, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "ref_ce_loss": 0.08201535046100616, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "loss": 0.3811852037906647, + "step": 14890 + }, + { + "ce_loss": 0.06830538064241409, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "distill_loss": 0.09254138171672821, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "ref_ce_loss": 0.07486068457365036, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "loss": 0.5785902738571167, + "step": 14890 + }, + { + "ce_loss": 0.16134631633758545, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "distill_loss": 0.1212388202548027, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "ref_ce_loss": 0.10440519452095032, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "loss": 0.2205401211977005, + "step": 14890 + }, + { + "ce_loss": 0.03984503075480461, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "distill_loss": 0.09075970202684402, + "epoch": 4.9666444296197465, + "step": 14890 + }, + { + "epoch": 4.9666444296197465, + "ref_ce_loss": 0.08843724429607391, + "step": 14890 + }, + { + "epoch": 4.969979986657772, + "loss": 0.3771, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "grad_norm": 1.7490321397781372, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "learning_rate": 9.939977546054517e-05, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "loss": 0.26214566826820374, + "step": 14900 + }, + { + "ce_loss": 0.05214730277657509, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "distill_loss": 0.11806771904230118, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "ref_ce_loss": 0.0917915552854538, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "loss": 0.4815196990966797, + "step": 14900 + }, + { + "ce_loss": 0.08143611997365952, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "distill_loss": 0.09976162016391754, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "ref_ce_loss": 0.08829763531684875, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "loss": 0.48579132556915283, + "step": 14900 + }, + { + "ce_loss": 0.09691136330366135, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "distill_loss": 0.12712033092975616, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "ref_ce_loss": 0.08161374181509018, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "loss": 0.32713285088539124, + "step": 14900 + }, + { + "ce_loss": 0.10395216941833496, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "distill_loss": 0.11791080236434937, + "epoch": 4.969979986657772, + "step": 14900 + }, + { + "epoch": 4.969979986657772, + "ref_ce_loss": 0.07737979292869568, + "step": 14900 + }, + { + "epoch": 4.973315543695797, + "loss": 0.3677, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "grad_norm": 4.023821830749512, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "learning_rate": 9.92091334237224e-05, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "loss": 0.17526739835739136, + "step": 14910 + }, + { + "ce_loss": 0.0291630607098341, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "distill_loss": 0.08895475417375565, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "ref_ce_loss": 0.056943051517009735, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "loss": 0.5921517610549927, + "step": 14910 + }, + { + "ce_loss": 0.14450669288635254, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "distill_loss": 0.15296611189842224, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "ref_ce_loss": 0.11224636435508728, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "loss": 0.2923111915588379, + "step": 14910 + }, + { + "ce_loss": 0.09236502647399902, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "distill_loss": 0.11198394745588303, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "ref_ce_loss": 0.08783869445323944, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "loss": 0.45573416352272034, + "step": 14910 + }, + { + "ce_loss": 0.18862444162368774, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "distill_loss": 0.14855432510375977, + "epoch": 4.973315543695797, + "step": 14910 + }, + { + "epoch": 4.973315543695797, + "ref_ce_loss": 0.11837711930274963, + "step": 14910 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.3831, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "grad_norm": 1.6999053955078125, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "learning_rate": 9.901858400940496e-05, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.22062714397907257, + "step": 14920 + }, + { + "ce_loss": 0.041478704661130905, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "distill_loss": 0.08729581534862518, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "ref_ce_loss": 0.04799078777432442, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.5395979881286621, + "step": 14920 + }, + { + "ce_loss": 0.18839100003242493, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "distill_loss": 0.17057166993618011, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "ref_ce_loss": 0.09586435556411743, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.7154274582862854, + "step": 14920 + }, + { + "ce_loss": 0.17241571843624115, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "distill_loss": 0.13662411272525787, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "ref_ce_loss": 0.1351361721754074, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "loss": 0.5005620121955872, + "step": 14920 + }, + { + "ce_loss": 0.15116122364997864, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "distill_loss": 0.12960541248321533, + "epoch": 4.9766511007338226, + "step": 14920 + }, + { + "epoch": 4.9766511007338226, + "ref_ce_loss": 0.11357004195451736, + "step": 14920 + }, + { + "epoch": 4.979986657771848, + "loss": 0.4151, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "grad_norm": 2.9988279342651367, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "learning_rate": 9.88281275650797e-05, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "loss": 0.3005850613117218, + "step": 14930 + }, + { + "ce_loss": 0.11170001327991486, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "distill_loss": 0.12441539764404297, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "ref_ce_loss": 0.06426378339529037, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "loss": 0.4188843071460724, + "step": 14930 + }, + { + "ce_loss": 0.10497825592756271, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "distill_loss": 0.10721724480390549, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "ref_ce_loss": 0.07746762037277222, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "loss": 0.3971782922744751, + "step": 14930 + }, + { + "ce_loss": 0.03395448625087738, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "distill_loss": 0.10532855242490768, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "ref_ce_loss": 0.07476559281349182, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "loss": 0.16209197044372559, + "step": 14930 + }, + { + "ce_loss": 0.028277037665247917, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "distill_loss": 0.06412121653556824, + "epoch": 4.979986657771848, + "step": 14930 + }, + { + "epoch": 4.979986657771848, + "ref_ce_loss": 0.03951544687151909, + "step": 14930 + }, + { + "epoch": 4.983322214809873, + "loss": 0.3849, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "grad_norm": 2.4632937908172607, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "learning_rate": 9.863776443806414e-05, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "loss": 0.3252128064632416, + "step": 14940 + }, + { + "ce_loss": 0.08578341454267502, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "distill_loss": 0.10882560163736343, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "ref_ce_loss": 0.0696091502904892, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "loss": 0.4829108417034149, + "step": 14940 + }, + { + "ce_loss": 0.17730078101158142, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "distill_loss": 0.125204935669899, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "ref_ce_loss": 0.10830342024564743, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "loss": 0.2506943643093109, + "step": 14940 + }, + { + "ce_loss": 0.015404731966555119, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "distill_loss": 0.11019233614206314, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "ref_ce_loss": 0.07074945420026779, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "loss": 0.3966948986053467, + "step": 14940 + }, + { + "ce_loss": 0.08755076676607132, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "distill_loss": 0.10180173814296722, + "epoch": 4.983322214809873, + "step": 14940 + }, + { + "epoch": 4.983322214809873, + "ref_ce_loss": 0.07565634697675705, + "step": 14940 + }, + { + "epoch": 4.986657771847899, + "loss": 0.4103, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "grad_norm": 3.3751020431518555, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "learning_rate": 9.844749497550549e-05, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "loss": 0.38711386919021606, + "step": 14950 + }, + { + "ce_loss": 0.12462891638278961, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "distill_loss": 0.12855587899684906, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "ref_ce_loss": 0.10362184047698975, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "loss": 0.36301493644714355, + "step": 14950 + }, + { + "ce_loss": 0.14747299253940582, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "distill_loss": 0.09996884316205978, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "ref_ce_loss": 0.11533646285533905, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "loss": 0.39188283681869507, + "step": 14950 + }, + { + "ce_loss": 0.058999788016080856, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "distill_loss": 0.12072397768497467, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "ref_ce_loss": 0.11123237013816833, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "loss": 0.285819947719574, + "step": 14950 + }, + { + "ce_loss": 0.08039289712905884, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "distill_loss": 0.09627498686313629, + "epoch": 4.986657771847899, + "step": 14950 + }, + { + "epoch": 4.986657771847899, + "ref_ce_loss": 0.07560764998197556, + "step": 14950 + }, + { + "epoch": 4.989993328885924, + "loss": 0.3899, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "grad_norm": 3.0399069786071777, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "learning_rate": 9.825731952438019e-05, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "loss": 0.2565562427043915, + "step": 14960 + }, + { + "ce_loss": 0.031080681830644608, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "distill_loss": 0.12158703804016113, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "ref_ce_loss": 0.0760444924235344, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "loss": 0.32719331979751587, + "step": 14960 + }, + { + "ce_loss": 0.12624545395374298, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "distill_loss": 0.08923916518688202, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "ref_ce_loss": 0.07610367238521576, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "loss": 0.3031754493713379, + "step": 14960 + }, + { + "ce_loss": 0.102663055062294, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "distill_loss": 0.12523791193962097, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "ref_ce_loss": 0.06037063151597977, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "loss": 0.1924704611301422, + "step": 14960 + }, + { + "ce_loss": 0.052973054349422455, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "distill_loss": 0.08791524171829224, + "epoch": 4.989993328885924, + "step": 14960 + }, + { + "epoch": 4.989993328885924, + "ref_ce_loss": 0.03767343983054161, + "step": 14960 + }, + { + "epoch": 4.993328885923949, + "loss": 0.4152, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "grad_norm": 2.185476064682007, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "learning_rate": 9.806723843149328e-05, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "loss": 0.5093054175376892, + "step": 14970 + }, + { + "ce_loss": 0.2362217903137207, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "distill_loss": 0.13822926580905914, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "ref_ce_loss": 0.1347385048866272, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "loss": 0.4155021905899048, + "step": 14970 + }, + { + "ce_loss": 0.0930529236793518, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "distill_loss": 0.12571507692337036, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "ref_ce_loss": 0.1019885390996933, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "loss": 0.5457403659820557, + "step": 14970 + }, + { + "ce_loss": 0.2082226276397705, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "distill_loss": 0.14089271426200867, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "ref_ce_loss": 0.11160488426685333, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "loss": 0.2308986932039261, + "step": 14970 + }, + { + "ce_loss": 0.05669495090842247, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "distill_loss": 0.10594461858272552, + "epoch": 4.993328885923949, + "step": 14970 + }, + { + "epoch": 4.993328885923949, + "ref_ce_loss": 0.046271566301584244, + "step": 14970 + }, + { + "epoch": 4.996664442961975, + "loss": 0.3631, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "grad_norm": 2.700335741043091, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "learning_rate": 9.787725204347764e-05, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "loss": 0.42493465542793274, + "step": 14980 + }, + { + "ce_loss": 0.14343814551830292, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "distill_loss": 0.11766191571950912, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "ref_ce_loss": 0.08248550444841385, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "loss": 0.3019244074821472, + "step": 14980 + }, + { + "ce_loss": 0.06459664553403854, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "distill_loss": 0.07496616989374161, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "ref_ce_loss": 0.06792156398296356, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "loss": 0.2959771156311035, + "step": 14980 + }, + { + "ce_loss": 0.050680242478847504, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "distill_loss": 0.10879239439964294, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "ref_ce_loss": 0.08710350841283798, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "loss": 0.32662272453308105, + "step": 14980 + }, + { + "ce_loss": 0.11703333258628845, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "distill_loss": 0.10472206771373749, + "epoch": 4.996664442961975, + "step": 14980 + }, + { + "epoch": 4.996664442961975, + "ref_ce_loss": 0.06959205120801926, + "step": 14980 + }, + { + "epoch": 5.0, + "loss": 0.3927, + "step": 14990 + }, + { + "epoch": 5.0, + "grad_norm": 3.773092031478882, + "step": 14990 + }, + { + "epoch": 5.0, + "learning_rate": 9.768736070679355e-05, + "step": 14990 + }, + { + "epoch": 5.0, + "loss": 0.24565860629081726, + "step": 14990 + }, + { + "ce_loss": 0.04676143452525139, + "epoch": 5.0, + "step": 14990 + }, + { + "distill_loss": 0.10307568311691284, + "epoch": 5.0, + "step": 14990 + }, + { + "epoch": 5.0, + "ref_ce_loss": 0.09563593566417694, + "step": 14990 + }, + { + "epoch": 5.0, + "loss": 0.2170976996421814, + "step": 14990 + }, + { + "ce_loss": 0.04341982305049896, + "epoch": 5.0, + "step": 14990 + }, + { + "distill_loss": 0.09917079657316208, + "epoch": 5.0, + "step": 14990 + }, + { + "epoch": 5.0, + "ref_ce_loss": 0.06050785630941391, + "step": 14990 + }, + { + "epoch": 5.0, + "loss": 0.27760088443756104, + "step": 14990 + }, + { + "ce_loss": 0.02374977245926857, + "epoch": 5.0, + "step": 14990 + }, + { + "distill_loss": 0.0888613685965538, + "epoch": 5.0, + "step": 14990 + }, + { + "epoch": 5.0, + "ref_ce_loss": 0.07210943847894669, + "step": 14990 + }, + { + "epoch": 5.0, + "loss": 0.3771913945674896, + "step": 14990 + }, + { + "ce_loss": 0.13417477905750275, + "epoch": 5.0, + "step": 14990 + }, + { + "distill_loss": 0.1130586713552475, + "epoch": 5.0, + "step": 14990 + }, + { + "epoch": 5.0, + "ref_ce_loss": 0.08856955170631409, + "step": 14990 + }, + { + "epoch": 5.003335557038025, + "loss": 0.2995, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "grad_norm": 1.9504388570785522, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "learning_rate": 9.749756476772786e-05, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "loss": 0.2518184185028076, + "step": 15000 + }, + { + "ce_loss": 0.027764635160565376, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "distill_loss": 0.11909808963537216, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "ref_ce_loss": 0.06962376087903976, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "loss": 0.3267233073711395, + "step": 15000 + }, + { + "ce_loss": 0.05888934060931206, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "distill_loss": 0.10403020679950714, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "ref_ce_loss": 0.04899250343441963, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "loss": 0.2693495452404022, + "step": 15000 + }, + { + "ce_loss": 0.09646335989236832, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "distill_loss": 0.09357049316167831, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "ref_ce_loss": 0.07913413643836975, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "loss": 0.2770167589187622, + "step": 15000 + }, + { + "ce_loss": 0.03341980651021004, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "distill_loss": 0.10253458470106125, + "epoch": 5.003335557038025, + "step": 15000 + }, + { + "epoch": 5.003335557038025, + "ref_ce_loss": 0.0455530546605587, + "step": 15000 + }, + { + "epoch": 5.006671114076051, + "loss": 0.3276, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "grad_norm": 3.6639962196350098, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "learning_rate": 9.73078645723935e-05, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "loss": 0.3098304271697998, + "step": 15010 + }, + { + "ce_loss": 0.06160301715135574, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "distill_loss": 0.11260523647069931, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "ref_ce_loss": 0.13522294163703918, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "loss": 0.20364679396152496, + "step": 15010 + }, + { + "ce_loss": 0.0429227389395237, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "distill_loss": 0.07736965268850327, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "ref_ce_loss": 0.08260839432477951, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "loss": 0.22606542706489563, + "step": 15010 + }, + { + "ce_loss": 0.05941081792116165, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "distill_loss": 0.11994045972824097, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "ref_ce_loss": 0.03545362129807472, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "loss": 0.28638121485710144, + "step": 15010 + }, + { + "ce_loss": 0.09630145877599716, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "distill_loss": 0.10460197925567627, + "epoch": 5.006671114076051, + "step": 15010 + }, + { + "epoch": 5.006671114076051, + "ref_ce_loss": 0.06608293205499649, + "step": 15010 + }, + { + "epoch": 5.010006671114076, + "loss": 0.3134, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "grad_norm": 1.8041044473648071, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "learning_rate": 9.711826046672886e-05, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "loss": 0.5290608406066895, + "step": 15020 + }, + { + "ce_loss": 0.08037453144788742, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "distill_loss": 0.11784839630126953, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "ref_ce_loss": 0.07413246482610703, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "loss": 0.31020018458366394, + "step": 15020 + }, + { + "ce_loss": 0.06627956032752991, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "distill_loss": 0.09461265802383423, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "ref_ce_loss": 0.07771609723567963, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "loss": 0.2674647271633148, + "step": 15020 + }, + { + "ce_loss": 0.08439767360687256, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "distill_loss": 0.09544394165277481, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "ref_ce_loss": 0.06403058767318726, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "loss": 0.23964346945285797, + "step": 15020 + }, + { + "ce_loss": 0.051768235862255096, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "distill_loss": 0.09334868937730789, + "epoch": 5.010006671114076, + "step": 15020 + }, + { + "epoch": 5.010006671114076, + "ref_ce_loss": 0.07185187935829163, + "step": 15020 + }, + { + "epoch": 5.013342228152101, + "loss": 0.3388, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "grad_norm": 5.189137935638428, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "learning_rate": 9.692875279649694e-05, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "loss": 0.34460267424583435, + "step": 15030 + }, + { + "ce_loss": 0.1255204826593399, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "distill_loss": 0.11821456253528595, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "ref_ce_loss": 0.10058291256427765, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "loss": 0.3356470465660095, + "step": 15030 + }, + { + "ce_loss": 0.032778576016426086, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "distill_loss": 0.10354934632778168, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "ref_ce_loss": 0.08033135533332825, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "loss": 0.41687262058258057, + "step": 15030 + }, + { + "ce_loss": 0.07428724318742752, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "distill_loss": 0.11658826470375061, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "ref_ce_loss": 0.0638238936662674, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "loss": 0.3669113218784332, + "step": 15030 + }, + { + "ce_loss": 0.11058314144611359, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "distill_loss": 0.1107931137084961, + "epoch": 5.013342228152101, + "step": 15030 + }, + { + "epoch": 5.013342228152101, + "ref_ce_loss": 0.09497855603694916, + "step": 15030 + }, + { + "epoch": 5.016677785190127, + "loss": 0.3591, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "grad_norm": 2.142228603363037, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "learning_rate": 9.6739341907285e-05, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "loss": 0.3057650625705719, + "step": 15040 + }, + { + "ce_loss": 0.08420991897583008, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "distill_loss": 0.11546208709478378, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "ref_ce_loss": 0.0664733350276947, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "loss": 0.37464606761932373, + "step": 15040 + }, + { + "ce_loss": 0.0816805511713028, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "distill_loss": 0.13084836304187775, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "ref_ce_loss": 0.07238293439149857, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "loss": 0.22750362753868103, + "step": 15040 + }, + { + "ce_loss": 0.027732811868190765, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "distill_loss": 0.06788545101881027, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "ref_ce_loss": 0.05549793690443039, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "loss": 0.18696361780166626, + "step": 15040 + }, + { + "ce_loss": 0.04056626930832863, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "distill_loss": 0.09064240753650665, + "epoch": 5.016677785190127, + "step": 15040 + }, + { + "epoch": 5.016677785190127, + "ref_ce_loss": 0.05517708882689476, + "step": 15040 + }, + { + "epoch": 5.020013342228152, + "loss": 0.3265, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "grad_norm": 2.525740385055542, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "learning_rate": 9.655002814450387e-05, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "loss": 0.29186737537384033, + "step": 15050 + }, + { + "ce_loss": 0.054460786283016205, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "distill_loss": 0.08650080114603043, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "ref_ce_loss": 0.052335672080516815, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "loss": 0.22459924221038818, + "step": 15050 + }, + { + "ce_loss": 0.0686323419213295, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "distill_loss": 0.0952027291059494, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "ref_ce_loss": 0.060247596353292465, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "loss": 0.4892418384552002, + "step": 15050 + }, + { + "ce_loss": 0.06871223449707031, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "distill_loss": 0.11085522174835205, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "ref_ce_loss": 0.071814626455307, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "loss": 0.3401147127151489, + "step": 15050 + }, + { + "ce_loss": 0.15101785957813263, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "distill_loss": 0.12028548866510391, + "epoch": 5.020013342228152, + "step": 15050 + }, + { + "epoch": 5.020013342228152, + "ref_ce_loss": 0.04461260139942169, + "step": 15050 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.3523, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "grad_norm": 2.574275016784668, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "learning_rate": 9.636081185338707e-05, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.19955489039421082, + "step": 15060 + }, + { + "ce_loss": 0.021272744983434677, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "distill_loss": 0.0945524051785469, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "ref_ce_loss": 0.05813675373792648, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.5390908122062683, + "step": 15060 + }, + { + "ce_loss": 0.18993806838989258, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "distill_loss": 0.12777474522590637, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "ref_ce_loss": 0.10478173196315765, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.5204997062683105, + "step": 15060 + }, + { + "ce_loss": 0.06704582273960114, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "distill_loss": 0.15518198907375336, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "ref_ce_loss": 0.07022137194871902, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "loss": 0.2526065707206726, + "step": 15060 + }, + { + "ce_loss": 0.037358131259679794, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "distill_loss": 0.09248964488506317, + "epoch": 5.0233488992661774, + "step": 15060 + }, + { + "epoch": 5.0233488992661774, + "ref_ce_loss": 0.08157315850257874, + "step": 15060 + }, + { + "epoch": 5.026684456304203, + "loss": 0.3519, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "grad_norm": 3.2034337520599365, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "learning_rate": 9.617169337899059e-05, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "loss": 0.40385645627975464, + "step": 15070 + }, + { + "ce_loss": 0.12731719017028809, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "distill_loss": 0.1379529982805252, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "ref_ce_loss": 0.07186760008335114, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "loss": 0.3676081895828247, + "step": 15070 + }, + { + "ce_loss": 0.06205548718571663, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "distill_loss": 0.13433827459812164, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "ref_ce_loss": 0.08207282423973083, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "loss": 0.4785653352737427, + "step": 15070 + }, + { + "ce_loss": 0.172767773270607, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "distill_loss": 0.18138282001018524, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "ref_ce_loss": 0.06890442222356796, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "loss": 0.20031343400478363, + "step": 15070 + }, + { + "ce_loss": 0.03773951530456543, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "distill_loss": 0.09012091904878616, + "epoch": 5.026684456304203, + "step": 15070 + }, + { + "epoch": 5.026684456304203, + "ref_ce_loss": 0.07225586473941803, + "step": 15070 + }, + { + "epoch": 5.030020013342228, + "loss": 0.4076, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "grad_norm": 2.200860023498535, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "learning_rate": 9.598267306619173e-05, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "loss": 0.3781607449054718, + "step": 15080 + }, + { + "ce_loss": 0.11247001588344574, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "distill_loss": 0.15823885798454285, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "ref_ce_loss": 0.08177276700735092, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "loss": 0.4920407831668854, + "step": 15080 + }, + { + "ce_loss": 0.14705343544483185, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "distill_loss": 0.19593298435211182, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "ref_ce_loss": 0.11216946691274643, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "loss": 0.218006432056427, + "step": 15080 + }, + { + "ce_loss": 0.03527948632836342, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "distill_loss": 0.0989982932806015, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "ref_ce_loss": 0.08341675996780396, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "loss": 0.24107953906059265, + "step": 15080 + }, + { + "ce_loss": 0.043232500553131104, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "distill_loss": 0.12532421946525574, + "epoch": 5.030020013342228, + "step": 15080 + }, + { + "epoch": 5.030020013342228, + "ref_ce_loss": 0.0574118047952652, + "step": 15080 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.3412, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "grad_norm": 2.083111047744751, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "learning_rate": 9.579375125968917e-05, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.44608885049819946, + "step": 15090 + }, + { + "ce_loss": 0.1435285210609436, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "distill_loss": 0.13029739260673523, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "ref_ce_loss": 0.12861613929271698, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.28388845920562744, + "step": 15090 + }, + { + "ce_loss": 0.04726126790046692, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "distill_loss": 0.10819245129823685, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "ref_ce_loss": 0.06490429490804672, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.34822142124176025, + "step": 15090 + }, + { + "ce_loss": 0.08054547011852264, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "distill_loss": 0.14222362637519836, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "ref_ce_loss": 0.08700346946716309, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "loss": 0.1939743012189865, + "step": 15090 + }, + { + "ce_loss": 0.035219430923461914, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "distill_loss": 0.08615106344223022, + "epoch": 5.0333555703802535, + "step": 15090 + }, + { + "epoch": 5.0333555703802535, + "ref_ce_loss": 0.07249585539102554, + "step": 15090 + }, + { + "epoch": 5.036691127418279, + "loss": 0.329, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "grad_norm": 2.6159894466400146, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "learning_rate": 9.560492830400172e-05, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "loss": 0.19131746888160706, + "step": 15100 + }, + { + "ce_loss": 0.039891522377729416, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "distill_loss": 0.0966179370880127, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "ref_ce_loss": 0.05468432232737541, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "loss": 0.38014402985572815, + "step": 15100 + }, + { + "ce_loss": 0.06777114421129227, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "distill_loss": 0.1002044826745987, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "ref_ce_loss": 0.056828632950782776, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "loss": 0.3731182813644409, + "step": 15100 + }, + { + "ce_loss": 0.11887678503990173, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "distill_loss": 0.11022542417049408, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "ref_ce_loss": 0.11073588579893112, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "loss": 0.317482590675354, + "step": 15100 + }, + { + "ce_loss": 0.05223778262734413, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "distill_loss": 0.10656572878360748, + "epoch": 5.036691127418279, + "step": 15100 + }, + { + "epoch": 5.036691127418279, + "ref_ce_loss": 0.06900766491889954, + "step": 15100 + }, + { + "epoch": 5.040026684456304, + "loss": 0.3281, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "grad_norm": 2.9384427070617676, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "learning_rate": 9.541620454346787e-05, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "loss": 0.31656724214553833, + "step": 15110 + }, + { + "ce_loss": 0.09671363234519958, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "distill_loss": 0.11364702135324478, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "ref_ce_loss": 0.106138676404953, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "loss": 0.31608346104621887, + "step": 15110 + }, + { + "ce_loss": 0.11589302867650986, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "distill_loss": 0.10061141848564148, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "ref_ce_loss": 0.09926266223192215, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "loss": 0.5507553815841675, + "step": 15110 + }, + { + "ce_loss": 0.19675405323505402, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "distill_loss": 0.09251846373081207, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "ref_ce_loss": 0.12421887367963791, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "loss": 0.28572094440460205, + "step": 15110 + }, + { + "ce_loss": 0.046184901148080826, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "distill_loss": 0.1092517301440239, + "epoch": 5.040026684456304, + "step": 15110 + }, + { + "epoch": 5.040026684456304, + "ref_ce_loss": 0.084798164665699, + "step": 15110 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.3422, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "grad_norm": 4.081774711608887, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "learning_rate": 9.522758032224545e-05, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.493918776512146, + "step": 15120 + }, + { + "ce_loss": 0.1023336723446846, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "distill_loss": 0.12242228537797928, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "ref_ce_loss": 0.053322214633226395, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.29232460260391235, + "step": 15120 + }, + { + "ce_loss": 0.05161577835679054, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "distill_loss": 0.10842837393283844, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "ref_ce_loss": 0.05500758811831474, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.26376548409461975, + "step": 15120 + }, + { + "ce_loss": 0.07995926588773727, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "distill_loss": 0.09874370694160461, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "ref_ce_loss": 0.08495025336742401, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "loss": 0.2894538938999176, + "step": 15120 + }, + { + "ce_loss": 0.04029352590441704, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "distill_loss": 0.13173624873161316, + "epoch": 5.0433622414943295, + "step": 15120 + }, + { + "epoch": 5.0433622414943295, + "ref_ce_loss": 0.0760183110833168, + "step": 15120 + }, + { + "epoch": 5.046697798532355, + "loss": 0.3141, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "grad_norm": 2.2110354900360107, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "learning_rate": 9.503905598431053e-05, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "loss": 0.30037644505500793, + "step": 15130 + }, + { + "ce_loss": 0.10891405493021011, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "distill_loss": 0.11043551564216614, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "ref_ce_loss": 0.06152831390500069, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "loss": 0.2595139443874359, + "step": 15130 + }, + { + "ce_loss": 0.08509143441915512, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "distill_loss": 0.1089167594909668, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "ref_ce_loss": 0.04903317987918854, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "loss": 0.24324160814285278, + "step": 15130 + }, + { + "ce_loss": 0.03422941640019417, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "distill_loss": 0.12488461285829544, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "ref_ce_loss": 0.0839512050151825, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "loss": 0.9537922739982605, + "step": 15130 + }, + { + "ce_loss": 0.10920923948287964, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "distill_loss": 0.11429134011268616, + "epoch": 5.046697798532355, + "step": 15130 + }, + { + "epoch": 5.046697798532355, + "ref_ce_loss": 0.07421170175075531, + "step": 15130 + }, + { + "epoch": 5.05003335557038, + "loss": 0.3921, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "grad_norm": 2.547940969467163, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "learning_rate": 9.485063187345712e-05, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "loss": 0.6976451873779297, + "step": 15140 + }, + { + "ce_loss": 0.09503049403429031, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "distill_loss": 0.14060279726982117, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "ref_ce_loss": 0.0637066587805748, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "loss": 0.301154226064682, + "step": 15140 + }, + { + "ce_loss": 0.03091120533645153, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "distill_loss": 0.11131655424833298, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "ref_ce_loss": 0.055801503360271454, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "loss": 0.3734534680843353, + "step": 15140 + }, + { + "ce_loss": 0.07543478161096573, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "distill_loss": 0.13863107562065125, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "ref_ce_loss": 0.10265032202005386, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "loss": 0.20870301127433777, + "step": 15140 + }, + { + "ce_loss": 0.012648079544305801, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "distill_loss": 0.091103196144104, + "epoch": 5.05003335557038, + "step": 15140 + }, + { + "epoch": 5.05003335557038, + "ref_ce_loss": 0.04741125553846359, + "step": 15140 + }, + { + "epoch": 5.053368912608406, + "loss": 0.3344, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "grad_norm": 3.0254764556884766, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "learning_rate": 9.466230833329663e-05, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "loss": 0.33127936720848083, + "step": 15150 + }, + { + "ce_loss": 0.045439403504133224, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "distill_loss": 0.11033318191766739, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "ref_ce_loss": 0.09881974011659622, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "loss": 0.24848978221416473, + "step": 15150 + }, + { + "ce_loss": 0.08748488873243332, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "distill_loss": 0.10021417587995529, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "ref_ce_loss": 0.05940258502960205, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "loss": 0.2611362934112549, + "step": 15150 + }, + { + "ce_loss": 0.06973495334386826, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "distill_loss": 0.11396859586238861, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "ref_ce_loss": 0.07708052545785904, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "loss": 0.510417640209198, + "step": 15150 + }, + { + "ce_loss": 0.14811956882476807, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "distill_loss": 0.13365697860717773, + "epoch": 5.053368912608406, + "step": 15150 + }, + { + "epoch": 5.053368912608406, + "ref_ce_loss": 0.0729028657078743, + "step": 15150 + }, + { + "epoch": 5.056704469646431, + "loss": 0.3312, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "grad_norm": 2.0235612392425537, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "learning_rate": 9.447408570725673e-05, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "loss": 0.2727489173412323, + "step": 15160 + }, + { + "ce_loss": 0.0502191036939621, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "distill_loss": 0.11405930668115616, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "ref_ce_loss": 0.07889159768819809, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "loss": 0.25519660115242004, + "step": 15160 + }, + { + "ce_loss": 0.09428223967552185, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "distill_loss": 0.10339818894863129, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "ref_ce_loss": 0.05715091899037361, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "loss": 0.2483268678188324, + "step": 15160 + }, + { + "ce_loss": 0.054164640605449677, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "distill_loss": 0.08527297526597977, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "ref_ce_loss": 0.0554942861199379, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "loss": 0.35373765230178833, + "step": 15160 + }, + { + "ce_loss": 0.05942567065358162, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "distill_loss": 0.1203283965587616, + "epoch": 5.056704469646431, + "step": 15160 + }, + { + "epoch": 5.056704469646431, + "ref_ce_loss": 0.08127188682556152, + "step": 15160 + }, + { + "epoch": 5.060040026684456, + "loss": 0.3311, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "grad_norm": 2.827293872833252, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "learning_rate": 9.428596433858136e-05, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "loss": 0.2379085123538971, + "step": 15170 + }, + { + "ce_loss": 0.041343193501234055, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "distill_loss": 0.13744580745697021, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "ref_ce_loss": 0.03911873325705528, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "loss": 0.5597814917564392, + "step": 15170 + }, + { + "ce_loss": 0.04659683257341385, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "distill_loss": 0.07799013704061508, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "ref_ce_loss": 0.08074043691158295, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "loss": 0.3475240170955658, + "step": 15170 + }, + { + "ce_loss": 0.0347454808652401, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "distill_loss": 0.10918953269720078, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "ref_ce_loss": 0.1333533227443695, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "loss": 0.33609458804130554, + "step": 15170 + }, + { + "ce_loss": 0.1237572431564331, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "distill_loss": 0.12210750579833984, + "epoch": 5.060040026684456, + "step": 15170 + }, + { + "epoch": 5.060040026684456, + "ref_ce_loss": 0.08911029249429703, + "step": 15170 + }, + { + "epoch": 5.063375583722482, + "loss": 0.3352, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "grad_norm": 1.6944329738616943, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "learning_rate": 9.409794457032959e-05, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "loss": 0.3432515859603882, + "step": 15180 + }, + { + "ce_loss": 0.0322825089097023, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "distill_loss": 0.11505106091499329, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "ref_ce_loss": 0.06054326146841049, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "loss": 0.2868589460849762, + "step": 15180 + }, + { + "ce_loss": 0.05462491139769554, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "distill_loss": 0.10324221104383469, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "ref_ce_loss": 0.07764450460672379, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "loss": 0.41091442108154297, + "step": 15180 + }, + { + "ce_loss": 0.14110834896564484, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "distill_loss": 0.13619858026504517, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "ref_ce_loss": 0.08693759888410568, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "loss": 0.37911006808280945, + "step": 15180 + }, + { + "ce_loss": 0.09819250553846359, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "distill_loss": 0.13968545198440552, + "epoch": 5.063375583722482, + "step": 15180 + }, + { + "epoch": 5.063375583722482, + "ref_ce_loss": 0.06106346845626831, + "step": 15180 + }, + { + "epoch": 5.066711140760507, + "loss": 0.3193, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "grad_norm": 2.3509528636932373, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "learning_rate": 9.391002674537538e-05, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "loss": 0.4000142812728882, + "step": 15190 + }, + { + "ce_loss": 0.07615770399570465, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "distill_loss": 0.15297187864780426, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "ref_ce_loss": 0.09272449463605881, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "loss": 0.49558955430984497, + "step": 15190 + }, + { + "ce_loss": 0.12531791627407074, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "distill_loss": 0.12746287882328033, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "ref_ce_loss": 0.09971451759338379, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "loss": 0.17504757642745972, + "step": 15190 + }, + { + "ce_loss": 0.02030886709690094, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "distill_loss": 0.0881674736738205, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "ref_ce_loss": 0.06651587039232254, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "loss": 0.2239861637353897, + "step": 15190 + }, + { + "ce_loss": 0.023175470530986786, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "distill_loss": 0.09253476560115814, + "epoch": 5.066711140760507, + "step": 15190 + }, + { + "epoch": 5.066711140760507, + "ref_ce_loss": 0.06468259543180466, + "step": 15190 + }, + { + "epoch": 5.070046697798532, + "loss": 0.2986, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "grad_norm": 3.5859014987945557, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "learning_rate": 9.37222112064067e-05, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "loss": 0.431865930557251, + "step": 15200 + }, + { + "ce_loss": 0.03801083192229271, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "distill_loss": 0.09702988713979721, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "ref_ce_loss": 0.08860328048467636, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "loss": 0.3617652654647827, + "step": 15200 + }, + { + "ce_loss": 0.09015195816755295, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "distill_loss": 0.10294674336910248, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "ref_ce_loss": 0.06347037851810455, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "loss": 0.19948525726795197, + "step": 15200 + }, + { + "ce_loss": 0.04417693242430687, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "distill_loss": 0.09427918493747711, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "ref_ce_loss": 0.05085763707756996, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "loss": 0.24006901681423187, + "step": 15200 + }, + { + "ce_loss": 0.049805957823991776, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "distill_loss": 0.06830768287181854, + "epoch": 5.070046697798532, + "step": 15200 + }, + { + "epoch": 5.070046697798532, + "ref_ce_loss": 0.0619424544274807, + "step": 15200 + }, + { + "epoch": 5.073382254836558, + "loss": 0.3709, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "grad_norm": 4.331812858581543, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "learning_rate": 9.353449829592502e-05, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "loss": 0.3576752841472626, + "step": 15210 + }, + { + "ce_loss": 0.09517745673656464, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "distill_loss": 0.09828440099954605, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "ref_ce_loss": 0.10009924322366714, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "loss": 0.21468718349933624, + "step": 15210 + }, + { + "ce_loss": 0.044741351157426834, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "distill_loss": 0.07588415592908859, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "ref_ce_loss": 0.06994284689426422, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "loss": 0.5331860184669495, + "step": 15210 + }, + { + "ce_loss": 0.06379786878824234, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "distill_loss": 0.0999433696269989, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "ref_ce_loss": 0.05732704699039459, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "loss": 0.2616701126098633, + "step": 15210 + }, + { + "ce_loss": 0.03689726069569588, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "distill_loss": 0.09216289967298508, + "epoch": 5.073382254836558, + "step": 15210 + }, + { + "epoch": 5.073382254836558, + "ref_ce_loss": 0.09889374673366547, + "step": 15210 + }, + { + "epoch": 5.076717811874583, + "loss": 0.3476, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "grad_norm": 2.3121414184570312, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "learning_rate": 9.334688835624459e-05, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "loss": 0.472523033618927, + "step": 15220 + }, + { + "ce_loss": 0.11912497878074646, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "distill_loss": 0.1291174292564392, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "ref_ce_loss": 0.11859400570392609, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "loss": 0.30351346731185913, + "step": 15220 + }, + { + "ce_loss": 0.010588656179606915, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "distill_loss": 0.07482418417930603, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "ref_ce_loss": 0.07996892929077148, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "loss": 0.458459734916687, + "step": 15220 + }, + { + "ce_loss": 0.06901762634515762, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "distill_loss": 0.10283029824495316, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "ref_ce_loss": 0.09888451546430588, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "loss": 0.31203603744506836, + "step": 15220 + }, + { + "ce_loss": 0.09299197793006897, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "distill_loss": 0.12244352698326111, + "epoch": 5.076717811874583, + "step": 15220 + }, + { + "epoch": 5.076717811874583, + "ref_ce_loss": 0.07369520515203476, + "step": 15220 + }, + { + "epoch": 5.080053368912608, + "loss": 0.3461, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "grad_norm": 3.2603442668914795, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "learning_rate": 9.3159381729492e-05, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "loss": 0.28526246547698975, + "step": 15230 + }, + { + "ce_loss": 0.09769675880670547, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "distill_loss": 0.10557578504085541, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "ref_ce_loss": 0.05751334875822067, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "loss": 0.1877545714378357, + "step": 15230 + }, + { + "ce_loss": 0.014897209592163563, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "distill_loss": 0.10393606126308441, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "ref_ce_loss": 0.05093152076005936, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "loss": 0.19574418663978577, + "step": 15230 + }, + { + "ce_loss": 0.01883758045732975, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "distill_loss": 0.08023209124803543, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "ref_ce_loss": 0.07024283707141876, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "loss": 0.38844019174575806, + "step": 15230 + }, + { + "ce_loss": 0.12029389292001724, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "distill_loss": 0.12513545155525208, + "epoch": 5.080053368912608, + "step": 15230 + }, + { + "epoch": 5.080053368912608, + "ref_ce_loss": 0.10578837245702744, + "step": 15230 + }, + { + "epoch": 5.083388925950634, + "loss": 0.2986, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "grad_norm": 3.108018159866333, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "learning_rate": 9.297197875760533e-05, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "loss": 0.6390986442565918, + "step": 15240 + }, + { + "ce_loss": 0.13565464317798615, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "distill_loss": 0.12606781721115112, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "ref_ce_loss": 0.08777014166116714, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "loss": 0.27036014199256897, + "step": 15240 + }, + { + "ce_loss": 0.06920570880174637, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "distill_loss": 0.09370496869087219, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "ref_ce_loss": 0.06318096071481705, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "loss": 0.3419868052005768, + "step": 15240 + }, + { + "ce_loss": 0.042620949447155, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "distill_loss": 0.08936305344104767, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "ref_ce_loss": 0.07430487871170044, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "loss": 0.3731623888015747, + "step": 15240 + }, + { + "ce_loss": 0.07372905313968658, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "distill_loss": 0.09781654924154282, + "epoch": 5.083388925950634, + "step": 15240 + }, + { + "epoch": 5.083388925950634, + "ref_ce_loss": 0.0866960659623146, + "step": 15240 + }, + { + "epoch": 5.086724482988659, + "loss": 0.3528, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "grad_norm": 2.033977508544922, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "learning_rate": 9.278467978233372e-05, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "loss": 0.3369258642196655, + "step": 15250 + }, + { + "ce_loss": 0.1479497104883194, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "distill_loss": 0.1182151734828949, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "ref_ce_loss": 0.06997857987880707, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "loss": 0.26905104517936707, + "step": 15250 + }, + { + "ce_loss": 0.04487191140651703, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "distill_loss": 0.08934783935546875, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "ref_ce_loss": 0.06253762543201447, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "loss": 0.39201846718788147, + "step": 15250 + }, + { + "ce_loss": 0.12044522911310196, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "distill_loss": 0.11188692599534988, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "ref_ce_loss": 0.0723666250705719, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "loss": 0.17451854050159454, + "step": 15250 + }, + { + "ce_loss": 0.04381131753325462, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "distill_loss": 0.08575734496116638, + "epoch": 5.086724482988659, + "step": 15250 + }, + { + "epoch": 5.086724482988659, + "ref_ce_loss": 0.0445830300450325, + "step": 15250 + }, + { + "epoch": 5.090060040026684, + "loss": 0.2921, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "grad_norm": 2.043253183364868, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "learning_rate": 9.259748514523653e-05, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "loss": 0.29348182678222656, + "step": 15260 + }, + { + "ce_loss": 0.07424913346767426, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "distill_loss": 0.09006142616271973, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "ref_ce_loss": 0.06587568670511246, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "loss": 0.3772607147693634, + "step": 15260 + }, + { + "ce_loss": 0.08147673308849335, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "distill_loss": 0.09111148118972778, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "ref_ce_loss": 0.12253421545028687, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "loss": 0.4282752275466919, + "step": 15260 + }, + { + "ce_loss": 0.10974805057048798, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "distill_loss": 0.10467756539583206, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "ref_ce_loss": 0.11229056119918823, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "loss": 0.26593005657196045, + "step": 15260 + }, + { + "ce_loss": 0.05944935232400894, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "distill_loss": 0.1361900418996811, + "epoch": 5.090060040026684, + "step": 15260 + }, + { + "epoch": 5.090060040026684, + "ref_ce_loss": 0.07016180455684662, + "step": 15260 + }, + { + "epoch": 5.09339559706471, + "loss": 0.3176, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "grad_norm": 4.857101917266846, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "learning_rate": 9.241039518768301e-05, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "loss": 0.19465944170951843, + "step": 15270 + }, + { + "ce_loss": 0.01948079839348793, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "distill_loss": 0.08734491467475891, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "ref_ce_loss": 0.04057784005999565, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "loss": 0.4380457401275635, + "step": 15270 + }, + { + "ce_loss": 0.09683702141046524, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "distill_loss": 0.13187667727470398, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "ref_ce_loss": 0.08302433043718338, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "loss": 0.42582035064697266, + "step": 15270 + }, + { + "ce_loss": 0.084503673017025, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "distill_loss": 0.10415349155664444, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "ref_ce_loss": 0.03986379876732826, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "loss": 0.2158432900905609, + "step": 15270 + }, + { + "ce_loss": 0.058277517557144165, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "distill_loss": 0.09706753492355347, + "epoch": 5.09339559706471, + "step": 15270 + }, + { + "epoch": 5.09339559706471, + "ref_ce_loss": 0.06036210060119629, + "step": 15270 + }, + { + "epoch": 5.096731154102735, + "loss": 0.3193, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "grad_norm": 1.6487956047058105, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "learning_rate": 9.222341025085144e-05, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "loss": 0.4731646776199341, + "step": 15280 + }, + { + "ce_loss": 0.16261830925941467, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "distill_loss": 0.12539474666118622, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "ref_ce_loss": 0.09363444149494171, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "loss": 0.4620886445045471, + "step": 15280 + }, + { + "ce_loss": 0.06607316434383392, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "distill_loss": 0.12559787929058075, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "ref_ce_loss": 0.08794140070676804, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "loss": 0.21943168342113495, + "step": 15280 + }, + { + "ce_loss": 0.03757490962743759, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "distill_loss": 0.09392014890909195, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "ref_ce_loss": 0.08771252632141113, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "loss": 0.2953905761241913, + "step": 15280 + }, + { + "ce_loss": 0.05430855229496956, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "distill_loss": 0.10314220190048218, + "epoch": 5.096731154102735, + "step": 15280 + }, + { + "epoch": 5.096731154102735, + "ref_ce_loss": 0.04704827815294266, + "step": 15280 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.3289, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "grad_norm": 2.2417662143707275, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "learning_rate": 9.203653067572855e-05, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.25665655732154846, + "step": 15290 + }, + { + "ce_loss": 0.03016096167266369, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "distill_loss": 0.09567056596279144, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "ref_ce_loss": 0.04990023002028465, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.30939507484436035, + "step": 15290 + }, + { + "ce_loss": 0.09582208096981049, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "distill_loss": 0.11035038530826569, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "ref_ce_loss": 0.06970993429422379, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.1940576434135437, + "step": 15290 + }, + { + "ce_loss": 0.04005913808941841, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "distill_loss": 0.09311024844646454, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "ref_ce_loss": 0.06080329045653343, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "loss": 0.4482858180999756, + "step": 15290 + }, + { + "ce_loss": 0.12186356633901596, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "distill_loss": 0.14439421892166138, + "epoch": 5.1000667111407605, + "step": 15290 + }, + { + "epoch": 5.1000667111407605, + "ref_ce_loss": 0.13036789000034332, + "step": 15290 + }, + { + "epoch": 5.103402268178786, + "loss": 0.3659, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "grad_norm": 3.9782018661499023, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "learning_rate": 9.184975680310901e-05, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "loss": 0.2465575784444809, + "step": 15300 + }, + { + "ce_loss": 0.06427565962076187, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "distill_loss": 0.08616838604211807, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "ref_ce_loss": 0.06796655058860779, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "loss": 0.25205734372138977, + "step": 15300 + }, + { + "ce_loss": 0.05255574360489845, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "distill_loss": 0.0944940596818924, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "ref_ce_loss": 0.0681745782494545, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "loss": 0.36056971549987793, + "step": 15300 + }, + { + "ce_loss": 0.1121678277850151, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "distill_loss": 0.13534539937973022, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "ref_ce_loss": 0.07255236059427261, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "loss": 0.7847564220428467, + "step": 15300 + }, + { + "ce_loss": 0.06709320098161697, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "distill_loss": 0.13826727867126465, + "epoch": 5.103402268178786, + "step": 15300 + }, + { + "epoch": 5.103402268178786, + "ref_ce_loss": 0.049133818596601486, + "step": 15300 + }, + { + "epoch": 5.106737825216811, + "loss": 0.3244, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "grad_norm": 4.447413921356201, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "learning_rate": 9.166308897359464e-05, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "loss": 0.35078248381614685, + "step": 15310 + }, + { + "ce_loss": 0.12107393890619278, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "distill_loss": 0.13566362857818604, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "ref_ce_loss": 0.05730048939585686, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "loss": 0.3367649018764496, + "step": 15310 + }, + { + "ce_loss": 0.06390248984098434, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "distill_loss": 0.1456541270017624, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "ref_ce_loss": 0.10176167637109756, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "loss": 0.23953138291835785, + "step": 15310 + }, + { + "ce_loss": 0.07575532048940659, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "distill_loss": 0.07758189737796783, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "ref_ce_loss": 0.04956245794892311, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "loss": 0.3669511079788208, + "step": 15310 + }, + { + "ce_loss": 0.058586835861206055, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "distill_loss": 0.10620009154081345, + "epoch": 5.106737825216811, + "step": 15310 + }, + { + "epoch": 5.106737825216811, + "ref_ce_loss": 0.14764153957366943, + "step": 15310 + }, + { + "epoch": 5.1100733822548365, + "loss": 0.3812, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "grad_norm": 2.4810564517974854, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "learning_rate": 9.147652752759394e-05, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "loss": 0.4458327889442444, + "step": 15320 + }, + { + "ce_loss": 0.09967893362045288, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "distill_loss": 0.13925136625766754, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "ref_ce_loss": 0.07300533354282379, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "loss": 0.47262224555015564, + "step": 15320 + }, + { + "ce_loss": 0.18473461270332336, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "distill_loss": 0.1558404266834259, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "ref_ce_loss": 0.11091091483831406, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "loss": 0.3459745943546295, + "step": 15320 + }, + { + "ce_loss": 0.07840926945209503, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "distill_loss": 0.09304723888635635, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "ref_ce_loss": 0.08125700801610947, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "loss": 0.18763215839862823, + "step": 15320 + }, + { + "ce_loss": 0.034408580511808395, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "distill_loss": 0.0835486575961113, + "epoch": 5.1100733822548365, + "step": 15320 + }, + { + "epoch": 5.1100733822548365, + "ref_ce_loss": 0.045321013778448105, + "step": 15320 + }, + { + "epoch": 5.113408939292862, + "loss": 0.3723, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "grad_norm": 2.5682082176208496, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "learning_rate": 9.129007280532144e-05, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "loss": 0.41082343459129333, + "step": 15330 + }, + { + "ce_loss": 0.12252845615148544, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "distill_loss": 0.1458989977836609, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "ref_ce_loss": 0.10372576117515564, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "loss": 0.26919203996658325, + "step": 15330 + }, + { + "ce_loss": 0.033415187150239944, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "distill_loss": 0.09982331097126007, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "ref_ce_loss": 0.0936480164527893, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "loss": 0.3246932625770569, + "step": 15330 + }, + { + "ce_loss": 0.09546466171741486, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "distill_loss": 0.10523758083581924, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "ref_ce_loss": 0.07602840662002563, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "loss": 0.17769919335842133, + "step": 15330 + }, + { + "ce_loss": 0.01439160481095314, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "distill_loss": 0.08686622977256775, + "epoch": 5.113408939292862, + "step": 15330 + }, + { + "epoch": 5.113408939292862, + "ref_ce_loss": 0.05428411811590195, + "step": 15330 + }, + { + "epoch": 5.116744496330887, + "loss": 0.3794, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "grad_norm": 3.887604236602783, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "learning_rate": 9.110372514679691e-05, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "loss": 0.39693522453308105, + "step": 15340 + }, + { + "ce_loss": 0.11991281062364578, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "distill_loss": 0.12842141091823578, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "ref_ce_loss": 0.07633031904697418, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "loss": 0.5600422620773315, + "step": 15340 + }, + { + "ce_loss": 0.10626500844955444, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "distill_loss": 0.08879688382148743, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "ref_ce_loss": 0.08295416831970215, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "loss": 0.24426814913749695, + "step": 15340 + }, + { + "ce_loss": 0.0578928105533123, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "distill_loss": 0.12534251809120178, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "ref_ce_loss": 0.06088608503341675, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "loss": 0.2940409481525421, + "step": 15340 + }, + { + "ce_loss": 0.033138569444417953, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "distill_loss": 0.11227000504732132, + "epoch": 5.116744496330887, + "step": 15340 + }, + { + "epoch": 5.116744496330887, + "ref_ce_loss": 0.06146222725510597, + "step": 15340 + }, + { + "epoch": 5.120080053368913, + "loss": 0.3713, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "grad_norm": 1.7324687242507935, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "learning_rate": 9.091748489184506e-05, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "loss": 0.32019171118736267, + "step": 15350 + }, + { + "ce_loss": 0.09052632749080658, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "distill_loss": 0.09617231041193008, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "ref_ce_loss": 0.09810184687376022, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "loss": 0.21579107642173767, + "step": 15350 + }, + { + "ce_loss": 0.02102779597043991, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "distill_loss": 0.10144367069005966, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "ref_ce_loss": 0.09315359592437744, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "loss": 0.24842038750648499, + "step": 15350 + }, + { + "ce_loss": 0.09609237313270569, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "distill_loss": 0.08486215770244598, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "ref_ce_loss": 0.06740619242191315, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "loss": 0.2568802237510681, + "step": 15350 + }, + { + "ce_loss": 0.10689549893140793, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "distill_loss": 0.09282051026821136, + "epoch": 5.120080053368913, + "step": 15350 + }, + { + "epoch": 5.120080053368913, + "ref_ce_loss": 0.0404185950756073, + "step": 15350 + }, + { + "epoch": 5.123415610406938, + "loss": 0.2897, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "grad_norm": 2.086287260055542, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "learning_rate": 9.073135238009464e-05, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "loss": 0.24856829643249512, + "step": 15360 + }, + { + "ce_loss": 0.04894977807998657, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "distill_loss": 0.09595238417387009, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "ref_ce_loss": 0.04782169312238693, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "loss": 0.2760925590991974, + "step": 15360 + }, + { + "ce_loss": 0.06440954655408859, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "distill_loss": 0.12960557639598846, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "ref_ce_loss": 0.05290549620985985, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "loss": 0.2818790078163147, + "step": 15360 + }, + { + "ce_loss": 0.05980372801423073, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "distill_loss": 0.14864784479141235, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "ref_ce_loss": 0.05248422548174858, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "loss": 0.21428939700126648, + "step": 15360 + }, + { + "ce_loss": 0.02077779360115528, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "distill_loss": 0.11778127402067184, + "epoch": 5.123415610406938, + "step": 15360 + }, + { + "epoch": 5.123415610406938, + "ref_ce_loss": 0.07553339004516602, + "step": 15360 + }, + { + "epoch": 5.126751167444963, + "loss": 0.3455, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "grad_norm": 3.2243845462799072, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "learning_rate": 9.054532795097787e-05, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "loss": 0.39891764521598816, + "step": 15370 + }, + { + "ce_loss": 0.028408152982592583, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "distill_loss": 0.07895895093679428, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "ref_ce_loss": 0.06767138093709946, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "loss": 0.3717252016067505, + "step": 15370 + }, + { + "ce_loss": 0.056756921112537384, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "distill_loss": 0.1180783361196518, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "ref_ce_loss": 0.04642847180366516, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "loss": 0.3460141122341156, + "step": 15370 + }, + { + "ce_loss": 0.07608392834663391, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "distill_loss": 0.12452583014965057, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "ref_ce_loss": 0.07184207439422607, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "loss": 0.33525413274765015, + "step": 15370 + }, + { + "ce_loss": 0.05221555754542351, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "distill_loss": 0.10671718418598175, + "epoch": 5.126751167444963, + "step": 15370 + }, + { + "epoch": 5.126751167444963, + "ref_ce_loss": 0.06864657998085022, + "step": 15370 + }, + { + "epoch": 5.130086724482989, + "loss": 0.3404, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "grad_norm": 1.6246341466903687, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "learning_rate": 9.035941194373002e-05, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "loss": 0.29942047595977783, + "step": 15380 + }, + { + "ce_loss": 0.07252945005893707, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "distill_loss": 0.11366333812475204, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "ref_ce_loss": 0.08319073170423508, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "loss": 0.9193277359008789, + "step": 15380 + }, + { + "ce_loss": 0.10665949434041977, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "distill_loss": 0.11006686836481094, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "ref_ce_loss": 0.052423495799303055, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "loss": 0.24829605221748352, + "step": 15380 + }, + { + "ce_loss": 0.043987125158309937, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "distill_loss": 0.10426180064678192, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "ref_ce_loss": 0.07357150316238403, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "loss": 0.3645699918270111, + "step": 15380 + }, + { + "ce_loss": 0.10562653839588165, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "distill_loss": 0.11489978432655334, + "epoch": 5.130086724482989, + "step": 15380 + }, + { + "epoch": 5.130086724482989, + "ref_ce_loss": 0.07542836666107178, + "step": 15380 + }, + { + "epoch": 5.133422281521014, + "loss": 0.3531, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "grad_norm": 2.2442030906677246, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "learning_rate": 9.01736046973884e-05, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "loss": 0.4058710038661957, + "step": 15390 + }, + { + "ce_loss": 0.08470144122838974, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "distill_loss": 0.14811912178993225, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "ref_ce_loss": 0.10741455852985382, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "loss": 0.23130950331687927, + "step": 15390 + }, + { + "ce_loss": 0.06238336116075516, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "distill_loss": 0.08404608815908432, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "ref_ce_loss": 0.06146818771958351, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "loss": 0.3462778925895691, + "step": 15390 + }, + { + "ce_loss": 0.08793909847736359, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "distill_loss": 0.11264742165803909, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "ref_ce_loss": 0.04813896492123604, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "loss": 0.25730276107788086, + "step": 15390 + }, + { + "ce_loss": 0.05243609845638275, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "distill_loss": 0.10522018373012543, + "epoch": 5.133422281521014, + "step": 15390 + }, + { + "epoch": 5.133422281521014, + "ref_ce_loss": 0.07645328342914581, + "step": 15390 + }, + { + "epoch": 5.136757838559039, + "loss": 0.3227, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "grad_norm": 3.527695655822754, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "learning_rate": 8.998790655079227e-05, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "loss": 0.519390344619751, + "step": 15400 + }, + { + "ce_loss": 0.08348696678876877, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "distill_loss": 0.1378469467163086, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "ref_ce_loss": 0.10278098285198212, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "loss": 0.32452601194381714, + "step": 15400 + }, + { + "ce_loss": 0.04311065003275871, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "distill_loss": 0.08557398617267609, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "ref_ce_loss": 0.05109192430973053, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "loss": 0.6966140866279602, + "step": 15400 + }, + { + "ce_loss": 0.04068000242114067, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "distill_loss": 0.10309980064630508, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "ref_ce_loss": 0.08326347917318344, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "loss": 0.3585582375526428, + "step": 15400 + }, + { + "ce_loss": 0.10225559026002884, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "distill_loss": 0.08397220820188522, + "epoch": 5.136757838559039, + "step": 15400 + }, + { + "epoch": 5.136757838559039, + "ref_ce_loss": 0.09892605245113373, + "step": 15400 + }, + { + "epoch": 5.140093395597065, + "loss": 0.3735, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "grad_norm": 4.83349609375, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "learning_rate": 8.980231784258181e-05, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "loss": 0.277986615896225, + "step": 15410 + }, + { + "ce_loss": 0.05845331773161888, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "distill_loss": 0.0897456556558609, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "ref_ce_loss": 0.09144359081983566, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "loss": 0.4590834081172943, + "step": 15410 + }, + { + "ce_loss": 0.14646229147911072, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "distill_loss": 0.159382164478302, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "ref_ce_loss": 0.11667078733444214, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "loss": 0.2067529857158661, + "step": 15410 + }, + { + "ce_loss": 0.044581297785043716, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "distill_loss": 0.08390741050243378, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "ref_ce_loss": 0.05592619255185127, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "loss": 0.20260076224803925, + "step": 15410 + }, + { + "ce_loss": 0.02507142350077629, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "distill_loss": 0.07849801331758499, + "epoch": 5.140093395597065, + "step": 15410 + }, + { + "epoch": 5.140093395597065, + "ref_ce_loss": 0.052643340080976486, + "step": 15410 + }, + { + "epoch": 5.14342895263509, + "loss": 0.3161, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "grad_norm": 1.7739144563674927, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "learning_rate": 8.961683891119746e-05, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "loss": 0.18003131449222565, + "step": 15420 + }, + { + "ce_loss": 0.04009293392300606, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "distill_loss": 0.0766187459230423, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "ref_ce_loss": 0.046599678695201874, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "loss": 0.16576485335826874, + "step": 15420 + }, + { + "ce_loss": 0.0029233212117105722, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "distill_loss": 0.08446105569601059, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "ref_ce_loss": 0.0579720139503479, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "loss": 0.5865795612335205, + "step": 15420 + }, + { + "ce_loss": 0.1778968721628189, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "distill_loss": 0.15113386511802673, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "ref_ce_loss": 0.08872532844543457, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "loss": 0.3178934156894684, + "step": 15420 + }, + { + "ce_loss": 0.11432060599327087, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "distill_loss": 0.11662434786558151, + "epoch": 5.14342895263509, + "step": 15420 + }, + { + "epoch": 5.14342895263509, + "ref_ce_loss": 0.08683821558952332, + "step": 15420 + }, + { + "epoch": 5.146764509673115, + "loss": 0.3325, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "grad_norm": 2.481506824493408, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "learning_rate": 8.943147009487982e-05, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "loss": 0.24885477125644684, + "step": 15430 + }, + { + "ce_loss": 0.04392295703291893, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "distill_loss": 0.07781894505023956, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "ref_ce_loss": 0.050707168877124786, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "loss": 0.38043659925460815, + "step": 15430 + }, + { + "ce_loss": 0.11024061590433121, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "distill_loss": 0.1460980325937271, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "ref_ce_loss": 0.0721927061676979, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "loss": 0.2317284792661667, + "step": 15430 + }, + { + "ce_loss": 0.051855847239494324, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "distill_loss": 0.09237996488809586, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "ref_ce_loss": 0.0872517004609108, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "loss": 0.2589787542819977, + "step": 15430 + }, + { + "ce_loss": 0.03987698629498482, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "distill_loss": 0.08046362549066544, + "epoch": 5.146764509673115, + "step": 15430 + }, + { + "epoch": 5.146764509673115, + "ref_ce_loss": 0.034916963428258896, + "step": 15430 + }, + { + "epoch": 5.150100066711141, + "loss": 0.3243, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "grad_norm": 3.888387680053711, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "learning_rate": 8.924621173166832e-05, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "loss": 0.3526419997215271, + "step": 15440 + }, + { + "ce_loss": 0.06011297181248665, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "distill_loss": 0.11874806135892868, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "ref_ce_loss": 0.09072629362344742, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "loss": 0.38294023275375366, + "step": 15440 + }, + { + "ce_loss": 0.1409599930047989, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "distill_loss": 0.14294949173927307, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "ref_ce_loss": 0.09855452924966812, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "loss": 0.22868633270263672, + "step": 15440 + }, + { + "ce_loss": 0.06168137490749359, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "distill_loss": 0.09113404154777527, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "ref_ce_loss": 0.07526343315839767, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "loss": 0.39461877942085266, + "step": 15440 + }, + { + "ce_loss": 0.12180479615926743, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "distill_loss": 0.1197972521185875, + "epoch": 5.150100066711141, + "step": 15440 + }, + { + "epoch": 5.150100066711141, + "ref_ce_loss": 0.08709913492202759, + "step": 15440 + }, + { + "epoch": 5.153435623749166, + "loss": 0.3175, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "grad_norm": 1.7098679542541504, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "learning_rate": 8.906106415940117e-05, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "loss": 0.269246369600296, + "step": 15450 + }, + { + "ce_loss": 0.08349336683750153, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "distill_loss": 0.08976483345031738, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "ref_ce_loss": 0.07661431282758713, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "loss": 0.887234091758728, + "step": 15450 + }, + { + "ce_loss": 0.07439880073070526, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "distill_loss": 0.1128494068980217, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "ref_ce_loss": 0.09533000737428665, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "loss": 0.4059937596321106, + "step": 15450 + }, + { + "ce_loss": 0.06530610471963882, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "distill_loss": 0.09470158070325851, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "ref_ce_loss": 0.06246132403612137, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "loss": 0.25273844599723816, + "step": 15450 + }, + { + "ce_loss": 0.08435259014368057, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "distill_loss": 0.07865085452795029, + "epoch": 5.153435623749166, + "step": 15450 + }, + { + "epoch": 5.153435623749166, + "ref_ce_loss": 0.0672929659485817, + "step": 15450 + }, + { + "epoch": 5.156771180787191, + "loss": 0.3104, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "grad_norm": 2.205923080444336, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "learning_rate": 8.887602771571466e-05, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "loss": 0.19020073115825653, + "step": 15460 + }, + { + "ce_loss": 0.033548977226018906, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "distill_loss": 0.0989307090640068, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "ref_ce_loss": 0.057435162365436554, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "loss": 0.2394929677248001, + "step": 15460 + }, + { + "ce_loss": 0.07328430563211441, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "distill_loss": 0.08450906723737717, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "ref_ce_loss": 0.056285351514816284, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "loss": 0.20999081432819366, + "step": 15460 + }, + { + "ce_loss": 0.029533574357628822, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "distill_loss": 0.07489515841007233, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "ref_ce_loss": 0.07470680773258209, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "loss": 0.22600901126861572, + "step": 15460 + }, + { + "ce_loss": 0.06494138389825821, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "distill_loss": 0.10133645683526993, + "epoch": 5.156771180787191, + "step": 15460 + }, + { + "epoch": 5.156771180787191, + "ref_ce_loss": 0.05953230336308479, + "step": 15460 + }, + { + "epoch": 5.160106737825217, + "loss": 0.3429, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "grad_norm": 2.1586124897003174, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "learning_rate": 8.86911027380421e-05, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "loss": 0.24646462500095367, + "step": 15470 + }, + { + "ce_loss": 0.07469391077756882, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "distill_loss": 0.09838026016950607, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "ref_ce_loss": 0.04848563298583031, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "loss": 0.3478372097015381, + "step": 15470 + }, + { + "ce_loss": 0.08327619731426239, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "distill_loss": 0.13971808552742004, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "ref_ce_loss": 0.10225691646337509, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "loss": 0.20459289848804474, + "step": 15470 + }, + { + "ce_loss": 0.04662002623081207, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "distill_loss": 0.07863349467515945, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "ref_ce_loss": 0.05946270748972893, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "loss": 0.29593658447265625, + "step": 15470 + }, + { + "ce_loss": 0.07237986475229263, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "distill_loss": 0.11817031353712082, + "epoch": 5.160106737825217, + "step": 15470 + }, + { + "epoch": 5.160106737825217, + "ref_ce_loss": 0.04803453013300896, + "step": 15470 + }, + { + "epoch": 5.163442294863242, + "loss": 0.3237, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "grad_norm": 2.479261636734009, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "learning_rate": 8.850628956361376e-05, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "loss": 0.6319223642349243, + "step": 15480 + }, + { + "ce_loss": 0.10117001831531525, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "distill_loss": 0.12282003462314606, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "ref_ce_loss": 0.0641319677233696, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "loss": 0.27091315388679504, + "step": 15480 + }, + { + "ce_loss": 0.08988615870475769, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "distill_loss": 0.1101350411772728, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "ref_ce_loss": 0.0706997960805893, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "loss": 0.3440433144569397, + "step": 15480 + }, + { + "ce_loss": 0.0868951752781868, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "distill_loss": 0.13729919493198395, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "ref_ce_loss": 0.08966067433357239, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "loss": 0.5818476676940918, + "step": 15480 + }, + { + "ce_loss": 0.11106500774621964, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "distill_loss": 0.08393864333629608, + "epoch": 5.163442294863242, + "step": 15480 + }, + { + "epoch": 5.163442294863242, + "ref_ce_loss": 0.07005763053894043, + "step": 15480 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.3475, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "grad_norm": 3.3304479122161865, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "learning_rate": 8.832158852945596e-05, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.1807374358177185, + "step": 15490 + }, + { + "ce_loss": 0.01700657792389393, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "distill_loss": 0.06784248352050781, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "ref_ce_loss": 0.05106615647673607, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.27543309330940247, + "step": 15490 + }, + { + "ce_loss": 0.08672452718019485, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "distill_loss": 0.08889733999967575, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "ref_ce_loss": 0.09958288818597794, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.2450074553489685, + "step": 15490 + }, + { + "ce_loss": 0.0512431301176548, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "distill_loss": 0.10720741748809814, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "ref_ce_loss": 0.0644150972366333, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "loss": 0.14427991211414337, + "step": 15490 + }, + { + "ce_loss": 0.02231277897953987, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "distill_loss": 0.06964313983917236, + "epoch": 5.1667778519012675, + "step": 15490 + }, + { + "epoch": 5.1667778519012675, + "ref_ce_loss": 0.05178777500987053, + "step": 15490 + }, + { + "epoch": 5.170113408939293, + "loss": 0.301, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "grad_norm": 3.6208198070526123, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "learning_rate": 8.813699997239051e-05, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "loss": 0.4033278226852417, + "step": 15500 + }, + { + "ce_loss": 0.14259237051010132, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "distill_loss": 0.10619431734085083, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "ref_ce_loss": 0.10307341068983078, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "loss": 0.3729739487171173, + "step": 15500 + }, + { + "ce_loss": 0.1008945181965828, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "distill_loss": 0.13121455907821655, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "ref_ce_loss": 0.07845688611268997, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "loss": 0.2734251618385315, + "step": 15500 + }, + { + "ce_loss": 0.06486482173204422, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "distill_loss": 0.1164950504899025, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "ref_ce_loss": 0.09184136241674423, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "loss": 0.2883954644203186, + "step": 15500 + }, + { + "ce_loss": 0.03929543495178223, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "distill_loss": 0.10786114633083344, + "epoch": 5.170113408939293, + "step": 15500 + }, + { + "epoch": 5.170113408939293, + "ref_ce_loss": 0.05867098271846771, + "step": 15500 + }, + { + "epoch": 5.173448965977318, + "loss": 0.3157, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "grad_norm": 1.871299386024475, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "learning_rate": 8.795252422903419e-05, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "loss": 0.32258161902427673, + "step": 15510 + }, + { + "ce_loss": 0.08582668751478195, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "distill_loss": 0.11141744256019592, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "ref_ce_loss": 0.08088202774524689, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "loss": 0.7449796795845032, + "step": 15510 + }, + { + "ce_loss": 0.07312709093093872, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "distill_loss": 0.10794167220592499, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "ref_ce_loss": 0.057008083909749985, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "loss": 0.2967376708984375, + "step": 15510 + }, + { + "ce_loss": 0.1069604903459549, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "distill_loss": 0.08719157427549362, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "ref_ce_loss": 0.07663427293300629, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "loss": 0.23122243583202362, + "step": 15510 + }, + { + "ce_loss": 0.04379252344369888, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "distill_loss": 0.06141211465001106, + "epoch": 5.173448965977318, + "step": 15510 + }, + { + "epoch": 5.173448965977318, + "ref_ce_loss": 0.06730645149946213, + "step": 15510 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.3259, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "grad_norm": 3.4619665145874023, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "learning_rate": 8.776816163579793e-05, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.3403814733028412, + "step": 15520 + }, + { + "ce_loss": 0.0703282505273819, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "distill_loss": 0.10586895048618317, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "ref_ce_loss": 0.07988087087869644, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.32925379276275635, + "step": 15520 + }, + { + "ce_loss": 0.07478674501180649, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "distill_loss": 0.11532959342002869, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "ref_ce_loss": 0.07584071159362793, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.3302445709705353, + "step": 15520 + }, + { + "ce_loss": 0.07153227180242538, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "distill_loss": 0.1040826067328453, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "ref_ce_loss": 0.10868798196315765, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "loss": 0.3438566327095032, + "step": 15520 + }, + { + "ce_loss": 0.1533452570438385, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "distill_loss": 0.09927986562252045, + "epoch": 5.1767845230153435, + "step": 15520 + }, + { + "epoch": 5.1767845230153435, + "ref_ce_loss": 0.09111157059669495, + "step": 15520 + }, + { + "epoch": 5.180120080053369, + "loss": 0.3633, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "grad_norm": 2.976386070251465, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "learning_rate": 8.758391252888638e-05, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "loss": 0.27730706334114075, + "step": 15530 + }, + { + "ce_loss": 0.04665299504995346, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "distill_loss": 0.11506421864032745, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "ref_ce_loss": 0.05354463309049606, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "loss": 0.2123173624277115, + "step": 15530 + }, + { + "ce_loss": 0.0355028361082077, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "distill_loss": 0.10253003984689713, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "ref_ce_loss": 0.07338284701108932, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "loss": 0.2025415003299713, + "step": 15530 + }, + { + "ce_loss": 0.06089678779244423, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "distill_loss": 0.07716229557991028, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "ref_ce_loss": 0.045202966779470444, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "loss": 0.2668789029121399, + "step": 15530 + }, + { + "ce_loss": 0.052694350481033325, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "distill_loss": 0.1183210164308548, + "epoch": 5.180120080053369, + "step": 15530 + }, + { + "epoch": 5.180120080053369, + "ref_ce_loss": 0.062336359173059464, + "step": 15530 + }, + { + "epoch": 5.183455637091394, + "loss": 0.3231, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "grad_norm": 2.042680501937866, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "learning_rate": 8.739977724429728e-05, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "loss": 0.24325260519981384, + "step": 15540 + }, + { + "ce_loss": 0.08089492470026016, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "distill_loss": 0.10611046105623245, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "ref_ce_loss": 0.05608109384775162, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "loss": 0.33710145950317383, + "step": 15540 + }, + { + "ce_loss": 0.05630598962306976, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "distill_loss": 0.09411294013261795, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "ref_ce_loss": 0.06278441846370697, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "loss": 0.3103136420249939, + "step": 15540 + }, + { + "ce_loss": 0.0829385444521904, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "distill_loss": 0.09629233926534653, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "ref_ce_loss": 0.07343555986881256, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "loss": 0.32832831144332886, + "step": 15540 + }, + { + "ce_loss": 0.06753069162368774, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "distill_loss": 0.10950666666030884, + "epoch": 5.183455637091394, + "step": 15540 + }, + { + "epoch": 5.183455637091394, + "ref_ce_loss": 0.07036434859037399, + "step": 15540 + }, + { + "epoch": 5.18679119412942, + "loss": 0.2975, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "grad_norm": 2.76190447807312, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "learning_rate": 8.721575611782067e-05, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "loss": 0.5175027847290039, + "step": 15550 + }, + { + "ce_loss": 0.051099736243486404, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "distill_loss": 0.08739770948886871, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "ref_ce_loss": 0.07908324152231216, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "loss": 0.3586389124393463, + "step": 15550 + }, + { + "ce_loss": 0.05912512168288231, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "distill_loss": 0.10872924327850342, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "ref_ce_loss": 0.08318324387073517, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "loss": 0.25400081276893616, + "step": 15550 + }, + { + "ce_loss": 0.041680727154016495, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "distill_loss": 0.1011752262711525, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "ref_ce_loss": 0.037580229341983795, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "loss": 0.28214314579963684, + "step": 15550 + }, + { + "ce_loss": 0.03214351460337639, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "distill_loss": 0.08211422711610794, + "epoch": 5.18679119412942, + "step": 15550 + }, + { + "epoch": 5.18679119412942, + "ref_ce_loss": 0.07695797830820084, + "step": 15550 + }, + { + "epoch": 5.190126751167445, + "loss": 0.3197, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "grad_norm": 1.7665784358978271, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "learning_rate": 8.703184948503859e-05, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "loss": 0.46674734354019165, + "step": 15560 + }, + { + "ce_loss": 0.06610564887523651, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "distill_loss": 0.1039234921336174, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "ref_ce_loss": 0.06362764537334442, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "loss": 0.29293379187583923, + "step": 15560 + }, + { + "ce_loss": 0.1026669591665268, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "distill_loss": 0.08947969228029251, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "ref_ce_loss": 0.07734373211860657, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "loss": 0.36403539776802063, + "step": 15560 + }, + { + "ce_loss": 0.10855984687805176, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "distill_loss": 0.10646361112594604, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "ref_ce_loss": 0.07490496337413788, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "loss": 0.16791392862796783, + "step": 15560 + }, + { + "ce_loss": 0.03769616410136223, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "distill_loss": 0.06060273200273514, + "epoch": 5.190126751167445, + "step": 15560 + }, + { + "epoch": 5.190126751167445, + "ref_ce_loss": 0.05328072980046272, + "step": 15560 + }, + { + "epoch": 5.19346230820547, + "loss": 0.3015, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "grad_norm": 2.2745091915130615, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "learning_rate": 8.684805768132409e-05, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "loss": 0.28142502903938293, + "step": 15570 + }, + { + "ce_loss": 0.05141424387693405, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "distill_loss": 0.07907916605472565, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "ref_ce_loss": 0.06478884071111679, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "loss": 0.25589194893836975, + "step": 15570 + }, + { + "ce_loss": 0.04737861454486847, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "distill_loss": 0.0656491070985794, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "ref_ce_loss": 0.09997665882110596, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "loss": 0.29582786560058594, + "step": 15570 + }, + { + "ce_loss": 0.0774170458316803, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "distill_loss": 0.08943985402584076, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "ref_ce_loss": 0.05729973688721657, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "loss": 0.38386616110801697, + "step": 15570 + }, + { + "ce_loss": 0.05994442105293274, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "distill_loss": 0.11905072629451752, + "epoch": 5.19346230820547, + "step": 15570 + }, + { + "epoch": 5.19346230820547, + "ref_ce_loss": 0.07005809992551804, + "step": 15570 + }, + { + "epoch": 5.196797865243496, + "loss": 0.3038, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "grad_norm": 1.919990062713623, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "learning_rate": 8.666438104184091e-05, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "loss": 0.29969045519828796, + "step": 15580 + }, + { + "ce_loss": 0.07858096808195114, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "distill_loss": 0.10086540132761002, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "ref_ce_loss": 0.052758295089006424, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "loss": 0.3363777697086334, + "step": 15580 + }, + { + "ce_loss": 0.05518624559044838, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "distill_loss": 0.11150319129228592, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "ref_ce_loss": 0.07867610454559326, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "loss": 0.24121202528476715, + "step": 15580 + }, + { + "ce_loss": 0.06729081273078918, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "distill_loss": 0.08743242919445038, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "ref_ce_loss": 0.032432373613119125, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "loss": 0.23406265676021576, + "step": 15580 + }, + { + "ce_loss": 0.057209137827157974, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "distill_loss": 0.07753507792949677, + "epoch": 5.196797865243496, + "step": 15580 + }, + { + "epoch": 5.196797865243496, + "ref_ce_loss": 0.07158780097961426, + "step": 15580 + }, + { + "epoch": 5.200133422281521, + "loss": 0.2973, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "grad_norm": 2.575624942779541, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "learning_rate": 8.648081990154298e-05, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "loss": 0.6412194967269897, + "step": 15590 + }, + { + "ce_loss": 0.105444997549057, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "distill_loss": 0.11191614717245102, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "ref_ce_loss": 0.10877172648906708, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "loss": 0.2196560502052307, + "step": 15590 + }, + { + "ce_loss": 0.038214799016714096, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "distill_loss": 0.1001727506518364, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "ref_ce_loss": 0.06055865436792374, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "loss": 0.22853578627109528, + "step": 15590 + }, + { + "ce_loss": 0.06929147988557816, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "distill_loss": 0.0751098096370697, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "ref_ce_loss": 0.08401620388031006, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "loss": 0.2704339921474457, + "step": 15590 + }, + { + "ce_loss": 0.06760897487401962, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "distill_loss": 0.12975336611270905, + "epoch": 5.200133422281521, + "step": 15590 + }, + { + "epoch": 5.200133422281521, + "ref_ce_loss": 0.07298162579536438, + "step": 15590 + }, + { + "epoch": 5.203468979319546, + "loss": 0.3892, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "grad_norm": 3.99639630317688, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "learning_rate": 8.62973745951732e-05, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "loss": 0.5640752911567688, + "step": 15600 + }, + { + "ce_loss": 0.06329736858606339, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "distill_loss": 0.10339559614658356, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "ref_ce_loss": 0.07652582228183746, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "loss": 0.2741006016731262, + "step": 15600 + }, + { + "ce_loss": 0.039778295904397964, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "distill_loss": 0.08627346158027649, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "ref_ce_loss": 0.08227671682834625, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "loss": 0.5446295142173767, + "step": 15600 + }, + { + "ce_loss": 0.06431914120912552, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "distill_loss": 0.09512020647525787, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "ref_ce_loss": 0.07713976502418518, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "loss": 0.2804473340511322, + "step": 15600 + }, + { + "ce_loss": 0.022517312318086624, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "distill_loss": 0.0781346783041954, + "epoch": 5.203468979319546, + "step": 15600 + }, + { + "epoch": 5.203468979319546, + "ref_ce_loss": 0.06672561168670654, + "step": 15600 + }, + { + "epoch": 5.206804536357572, + "loss": 0.3379, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "grad_norm": 3.4212775230407715, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "learning_rate": 8.61140454572636e-05, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "loss": 0.33479341864585876, + "step": 15610 + }, + { + "ce_loss": 0.036282870918512344, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "distill_loss": 0.08696107566356659, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "ref_ce_loss": 0.03990389034152031, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "loss": 0.3305779695510864, + "step": 15610 + }, + { + "ce_loss": 0.0724162608385086, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "distill_loss": 0.11297457665205002, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "ref_ce_loss": 0.06067904457449913, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "loss": 0.22460030019283295, + "step": 15610 + }, + { + "ce_loss": 0.023353662341833115, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "distill_loss": 0.0965401828289032, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "ref_ce_loss": 0.06605301052331924, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "loss": 0.2593958377838135, + "step": 15610 + }, + { + "ce_loss": 0.03210054337978363, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "distill_loss": 0.08811073005199432, + "epoch": 5.206804536357572, + "step": 15610 + }, + { + "epoch": 5.206804536357572, + "ref_ce_loss": 0.0475555881857872, + "step": 15610 + }, + { + "epoch": 5.210140093395597, + "loss": 0.3564, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "grad_norm": 2.2904813289642334, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "learning_rate": 8.593083282213406e-05, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "loss": 0.23321533203125, + "step": 15620 + }, + { + "ce_loss": 0.07071798294782639, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "distill_loss": 0.09500987827777863, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "ref_ce_loss": 0.0674041360616684, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "loss": 0.30617424845695496, + "step": 15620 + }, + { + "ce_loss": 0.06755051016807556, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "distill_loss": 0.10673744976520538, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "ref_ce_loss": 0.09195166826248169, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "loss": 0.4118172824382782, + "step": 15620 + }, + { + "ce_loss": 0.15275129675865173, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "distill_loss": 0.13609610497951508, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "ref_ce_loss": 0.0815357118844986, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "loss": 0.44408926367759705, + "step": 15620 + }, + { + "ce_loss": 0.10387061536312103, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "distill_loss": 0.08926932513713837, + "epoch": 5.210140093395597, + "step": 15620 + }, + { + "epoch": 5.210140093395597, + "ref_ce_loss": 0.052122075110673904, + "step": 15620 + }, + { + "epoch": 5.213475650433622, + "loss": 0.3449, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "grad_norm": 4.765140533447266, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "learning_rate": 8.574773702389224e-05, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "loss": 0.38619959354400635, + "step": 15630 + }, + { + "ce_loss": 0.07783541828393936, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "distill_loss": 0.09005855023860931, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "ref_ce_loss": 0.08827793598175049, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "loss": 0.3454456925392151, + "step": 15630 + }, + { + "ce_loss": 0.08330843597650528, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "distill_loss": 0.11718885600566864, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "ref_ce_loss": 0.06938067078590393, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "loss": 0.2988470792770386, + "step": 15630 + }, + { + "ce_loss": 0.06816891580820084, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "distill_loss": 0.09233548492193222, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "ref_ce_loss": 0.09514269977807999, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "loss": 0.28821972012519836, + "step": 15630 + }, + { + "ce_loss": 0.06957666575908661, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "distill_loss": 0.09795127809047699, + "epoch": 5.213475650433622, + "step": 15630 + }, + { + "epoch": 5.213475650433622, + "ref_ce_loss": 0.07120463997125626, + "step": 15630 + }, + { + "epoch": 5.216811207471648, + "loss": 0.3606, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "grad_norm": 4.76863956451416, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "learning_rate": 8.556475839643263e-05, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "loss": 0.43406257033348083, + "step": 15640 + }, + { + "ce_loss": 0.10813289135694504, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "distill_loss": 0.13066282868385315, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "ref_ce_loss": 0.09365534037351608, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "loss": 0.34955552220344543, + "step": 15640 + }, + { + "ce_loss": 0.1082388311624527, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "distill_loss": 0.11656052619218826, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "ref_ce_loss": 0.09189897775650024, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "loss": 0.2614520192146301, + "step": 15640 + }, + { + "ce_loss": 0.02691487781703472, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "distill_loss": 0.11289530992507935, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "ref_ce_loss": 0.0644138976931572, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "loss": 0.6140693426132202, + "step": 15640 + }, + { + "ce_loss": 0.04645974934101105, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "distill_loss": 0.11816476285457611, + "epoch": 5.216811207471648, + "step": 15640 + }, + { + "epoch": 5.216811207471648, + "ref_ce_loss": 0.07548999786376953, + "step": 15640 + }, + { + "epoch": 5.220146764509673, + "loss": 0.3625, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "grad_norm": 2.3993215560913086, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "learning_rate": 8.538189727343607e-05, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "loss": 0.2743743360042572, + "step": 15650 + }, + { + "ce_loss": 0.06477046012878418, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "distill_loss": 0.09259688854217529, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "ref_ce_loss": 0.05351939797401428, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "loss": 0.21701106429100037, + "step": 15650 + }, + { + "ce_loss": 0.03109646402299404, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "distill_loss": 0.0818566232919693, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "ref_ce_loss": 0.06140496954321861, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "loss": 0.29552048444747925, + "step": 15650 + }, + { + "ce_loss": 0.09033332020044327, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "distill_loss": 0.09672441333532333, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "ref_ce_loss": 0.08516726642847061, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "loss": 0.21063679456710815, + "step": 15650 + }, + { + "ce_loss": 0.03232778608798981, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "distill_loss": 0.09249082952737808, + "epoch": 5.220146764509673, + "step": 15650 + }, + { + "epoch": 5.220146764509673, + "ref_ce_loss": 0.06132323667407036, + "step": 15650 + }, + { + "epoch": 5.223482321547698, + "loss": 0.3319, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "grad_norm": 2.9076249599456787, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "learning_rate": 8.519915398836912e-05, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "loss": 0.24805200099945068, + "step": 15660 + }, + { + "ce_loss": 0.08938482403755188, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "distill_loss": 0.0876886323094368, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "ref_ce_loss": 0.07093243300914764, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "loss": 0.2605586349964142, + "step": 15660 + }, + { + "ce_loss": 0.05598803982138634, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "distill_loss": 0.10434643924236298, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "ref_ce_loss": 0.07283356040716171, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "loss": 0.4666854739189148, + "step": 15660 + }, + { + "ce_loss": 0.10625138133764267, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "distill_loss": 0.12450964003801346, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "ref_ce_loss": 0.06658724695444107, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "loss": 0.20904622972011566, + "step": 15660 + }, + { + "ce_loss": 0.03591933846473694, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "distill_loss": 0.0913221463561058, + "epoch": 5.223482321547698, + "step": 15660 + }, + { + "epoch": 5.223482321547698, + "ref_ce_loss": 0.08163376152515411, + "step": 15660 + }, + { + "epoch": 5.226817878585724, + "loss": 0.3339, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "grad_norm": 3.4926912784576416, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "learning_rate": 8.501652887448354e-05, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "loss": 0.3464776277542114, + "step": 15670 + }, + { + "ce_loss": 0.053136661648750305, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "distill_loss": 0.13123872876167297, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "ref_ce_loss": 0.09237124770879745, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "loss": 0.4693766236305237, + "step": 15670 + }, + { + "ce_loss": 0.09558730572462082, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "distill_loss": 0.10919008404016495, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "ref_ce_loss": 0.08798684179782867, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "loss": 0.8525584936141968, + "step": 15670 + }, + { + "ce_loss": 0.13166871666908264, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "distill_loss": 0.12148617953062057, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "ref_ce_loss": 0.08422347158193588, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "loss": 0.5308492183685303, + "step": 15670 + }, + { + "ce_loss": 0.12234484404325485, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "distill_loss": 0.1193431094288826, + "epoch": 5.226817878585724, + "step": 15670 + }, + { + "epoch": 5.226817878585724, + "ref_ce_loss": 0.0752011314034462, + "step": 15670 + }, + { + "epoch": 5.230153435623749, + "loss": 0.3824, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "grad_norm": 1.839232087135315, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "learning_rate": 8.483402226481531e-05, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "loss": 0.9070611000061035, + "step": 15680 + }, + { + "ce_loss": 0.10845021158456802, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "distill_loss": 0.15904279053211212, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "ref_ce_loss": 0.10060586035251617, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "loss": 0.20827336609363556, + "step": 15680 + }, + { + "ce_loss": 0.0360378660261631, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "distill_loss": 0.08658089488744736, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "ref_ce_loss": 0.0515529103577137, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "loss": 0.3215510845184326, + "step": 15680 + }, + { + "ce_loss": 0.0539592020213604, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "distill_loss": 0.13022923469543457, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "ref_ce_loss": 0.0769435316324234, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "loss": 0.17728765308856964, + "step": 15680 + }, + { + "ce_loss": 0.020045332610607147, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "distill_loss": 0.072093665599823, + "epoch": 5.230153435623749, + "step": 15680 + }, + { + "epoch": 5.230153435623749, + "ref_ce_loss": 0.06019889935851097, + "step": 15680 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.3081, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "grad_norm": 3.0346570014953613, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "learning_rate": 8.46516344921846e-05, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.27006831765174866, + "step": 15690 + }, + { + "ce_loss": 0.0540575347840786, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "distill_loss": 0.11163632571697235, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "ref_ce_loss": 0.07398149371147156, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.25738635659217834, + "step": 15690 + }, + { + "ce_loss": 0.09728316217660904, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "distill_loss": 0.10439582914113998, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "ref_ce_loss": 0.05562935024499893, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.32098954916000366, + "step": 15690 + }, + { + "ce_loss": 0.04642213135957718, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "distill_loss": 0.07827766239643097, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "ref_ce_loss": 0.07306087762117386, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "loss": 0.34884780645370483, + "step": 15690 + }, + { + "ce_loss": 0.04866236820816994, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "distill_loss": 0.11488591134548187, + "epoch": 5.2334889926617745, + "step": 15690 + }, + { + "epoch": 5.2334889926617745, + "ref_ce_loss": 0.06572486460208893, + "step": 15690 + }, + { + "epoch": 5.2368245496998, + "loss": 0.3341, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "grad_norm": 1.6081671714782715, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "learning_rate": 8.446936588919475e-05, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "loss": 0.3010448217391968, + "step": 15700 + }, + { + "ce_loss": 0.08089329302310944, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "distill_loss": 0.09048581123352051, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "ref_ce_loss": 0.07929427921772003, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "loss": 0.35104861855506897, + "step": 15700 + }, + { + "ce_loss": 0.11544732749462128, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "distill_loss": 0.13719846308231354, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "ref_ce_loss": 0.08232993632555008, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "loss": 0.2056066393852234, + "step": 15700 + }, + { + "ce_loss": 0.04238485172390938, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "distill_loss": 0.08624627441167831, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "ref_ce_loss": 0.04173652455210686, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "loss": 0.24264144897460938, + "step": 15700 + }, + { + "ce_loss": 0.03431288152933121, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "distill_loss": 0.10987095534801483, + "epoch": 5.2368245496998, + "step": 15700 + }, + { + "epoch": 5.2368245496998, + "ref_ce_loss": 0.0590670071542263, + "step": 15700 + }, + { + "epoch": 5.240160106737825, + "loss": 0.305, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "grad_norm": 1.7107326984405518, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "learning_rate": 8.428721678823178e-05, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "loss": 0.22281035780906677, + "step": 15710 + }, + { + "ce_loss": 0.01576351746916771, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "distill_loss": 0.08575361967086792, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "ref_ce_loss": 0.06620697677135468, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "loss": 0.2195720225572586, + "step": 15710 + }, + { + "ce_loss": 0.032049402594566345, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "distill_loss": 0.12723985314369202, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "ref_ce_loss": 0.06024665758013725, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "loss": 0.2883130609989166, + "step": 15710 + }, + { + "ce_loss": 0.06870332360267639, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "distill_loss": 0.11934572458267212, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "ref_ce_loss": 0.10023310780525208, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "loss": 0.28461188077926636, + "step": 15710 + }, + { + "ce_loss": 0.08343104273080826, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "distill_loss": 0.11157909035682678, + "epoch": 5.240160106737825, + "step": 15710 + }, + { + "epoch": 5.240160106737825, + "ref_ce_loss": 0.08946184813976288, + "step": 15710 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.3315, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "grad_norm": 3.451080322265625, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "learning_rate": 8.41051875214639e-05, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.361646831035614, + "step": 15720 + }, + { + "ce_loss": 0.10292549431324005, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "distill_loss": 0.11092507839202881, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "ref_ce_loss": 0.07696814090013504, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.22763587534427643, + "step": 15720 + }, + { + "ce_loss": 0.044729799032211304, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "distill_loss": 0.08984865248203278, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "ref_ce_loss": 0.05932828038930893, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.21750256419181824, + "step": 15720 + }, + { + "ce_loss": 0.0500781424343586, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "distill_loss": 0.08333169668912888, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "ref_ce_loss": 0.07430606335401535, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "loss": 0.3150693476200104, + "step": 15720 + }, + { + "ce_loss": 0.06628770381212234, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "distill_loss": 0.09029305726289749, + "epoch": 5.2434956637758505, + "step": 15720 + }, + { + "epoch": 5.2434956637758505, + "ref_ce_loss": 0.08698195964097977, + "step": 15720 + }, + { + "epoch": 5.246831220813876, + "loss": 0.3561, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "grad_norm": 4.539619445800781, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "learning_rate": 8.392327842084052e-05, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "loss": 0.31229639053344727, + "step": 15730 + }, + { + "ce_loss": 0.0605168417096138, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "distill_loss": 0.12786982953548431, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "ref_ce_loss": 0.1003168374300003, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "loss": 0.3192044794559479, + "step": 15730 + }, + { + "ce_loss": 0.10276396572589874, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "distill_loss": 0.12135171890258789, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "ref_ce_loss": 0.06497390568256378, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "loss": 0.37759923934936523, + "step": 15730 + }, + { + "ce_loss": 0.07716540992259979, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "distill_loss": 0.0948992669582367, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "ref_ce_loss": 0.056300580501556396, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "loss": 0.22098159790039062, + "step": 15730 + }, + { + "ce_loss": 0.04677729308605194, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "distill_loss": 0.11466146260499954, + "epoch": 5.246831220813876, + "step": 15730 + }, + { + "epoch": 5.246831220813876, + "ref_ce_loss": 0.05951519310474396, + "step": 15730 + }, + { + "epoch": 5.250166777851901, + "loss": 0.3294, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "grad_norm": 4.578763008117676, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "learning_rate": 8.374148981809216e-05, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "loss": 0.2728346884250641, + "step": 15740 + }, + { + "ce_loss": 0.06905033439397812, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "distill_loss": 0.09745746850967407, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "ref_ce_loss": 0.08488545566797256, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "loss": 0.4803808927536011, + "step": 15740 + }, + { + "ce_loss": 0.022304605692625046, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "distill_loss": 0.10700996965169907, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "ref_ce_loss": 0.04373454675078392, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "loss": 0.25218603014945984, + "step": 15740 + }, + { + "ce_loss": 0.10156643390655518, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "distill_loss": 0.09463807940483093, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "ref_ce_loss": 0.040142521262168884, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "loss": 0.309573769569397, + "step": 15740 + }, + { + "ce_loss": 0.08238134533166885, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "distill_loss": 0.10327878594398499, + "epoch": 5.250166777851901, + "step": 15740 + }, + { + "epoch": 5.250166777851901, + "ref_ce_loss": 0.09019621461629868, + "step": 15740 + }, + { + "epoch": 5.253502334889927, + "loss": 0.3562, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "grad_norm": 2.2583131790161133, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "learning_rate": 8.355982204472953e-05, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "loss": 0.4027312695980072, + "step": 15750 + }, + { + "ce_loss": 0.057967305183410645, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "distill_loss": 0.12211738526821136, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "ref_ce_loss": 0.051287103444337845, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "loss": 0.25625723600387573, + "step": 15750 + }, + { + "ce_loss": 0.061924465000629425, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "distill_loss": 0.1128692552447319, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "ref_ce_loss": 0.046483397483825684, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "loss": 0.4449954330921173, + "step": 15750 + }, + { + "ce_loss": 0.026363397017121315, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "distill_loss": 0.12303638458251953, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "ref_ce_loss": 0.08882566541433334, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "loss": 0.8488062024116516, + "step": 15750 + }, + { + "ce_loss": 0.07099393010139465, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "distill_loss": 0.10903570055961609, + "epoch": 5.253502334889927, + "step": 15750 + }, + { + "epoch": 5.253502334889927, + "ref_ce_loss": 0.08942622691392899, + "step": 15750 + }, + { + "epoch": 5.256837891927952, + "loss": 0.3294, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "grad_norm": 2.5139570236206055, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "learning_rate": 8.337827543204296e-05, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "loss": 0.24721331894397736, + "step": 15760 + }, + { + "ce_loss": 0.04199664294719696, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "distill_loss": 0.11916942894458771, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "ref_ce_loss": 0.05043063685297966, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "loss": 0.2464386373758316, + "step": 15760 + }, + { + "ce_loss": 0.029941368848085403, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "distill_loss": 0.09624037146568298, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "ref_ce_loss": 0.08408761769533157, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "loss": 0.3620655834674835, + "step": 15760 + }, + { + "ce_loss": 0.11664766073226929, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "distill_loss": 0.12865331768989563, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "ref_ce_loss": 0.06753537803888321, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "loss": 0.16280080378055573, + "step": 15760 + }, + { + "ce_loss": 0.030394496396183968, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "distill_loss": 0.08359411358833313, + "epoch": 5.256837891927952, + "step": 15760 + }, + { + "epoch": 5.256837891927952, + "ref_ce_loss": 0.024733874946832657, + "step": 15760 + }, + { + "epoch": 5.260173448965977, + "loss": 0.3624, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "grad_norm": 2.7002289295196533, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "learning_rate": 8.319685031110196e-05, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "loss": 0.20755645632743835, + "step": 15770 + }, + { + "ce_loss": 0.007387253921478987, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "distill_loss": 0.08905821293592453, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "ref_ce_loss": 0.06709232181310654, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "loss": 0.38863542675971985, + "step": 15770 + }, + { + "ce_loss": 0.1310935616493225, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "distill_loss": 0.15545101463794708, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "ref_ce_loss": 0.08572543412446976, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "loss": 0.3048025965690613, + "step": 15770 + }, + { + "ce_loss": 0.029002025723457336, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "distill_loss": 0.09924504905939102, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "ref_ce_loss": 0.08231877535581589, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "loss": 0.4077030122280121, + "step": 15770 + }, + { + "ce_loss": 0.07996194064617157, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "distill_loss": 0.16033872961997986, + "epoch": 5.260173448965977, + "step": 15770 + }, + { + "epoch": 5.260173448965977, + "ref_ce_loss": 0.0761852115392685, + "step": 15770 + }, + { + "epoch": 5.263509006004003, + "loss": 0.3434, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "grad_norm": 3.2176711559295654, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "learning_rate": 8.301554701275423e-05, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "loss": 0.3810529410839081, + "step": 15780 + }, + { + "ce_loss": 0.1390722543001175, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "distill_loss": 0.12283456325531006, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "ref_ce_loss": 0.0769791379570961, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "loss": 0.42334967851638794, + "step": 15780 + }, + { + "ce_loss": 0.17356954514980316, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "distill_loss": 0.186984121799469, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "ref_ce_loss": 0.06272926926612854, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "loss": 0.28403088450431824, + "step": 15780 + }, + { + "ce_loss": 0.0569252148270607, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "distill_loss": 0.13977956771850586, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "ref_ce_loss": 0.05604541301727295, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "loss": 0.5414626598358154, + "step": 15780 + }, + { + "ce_loss": 0.07642010599374771, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "distill_loss": 0.116451695561409, + "epoch": 5.263509006004003, + "step": 15780 + }, + { + "epoch": 5.263509006004003, + "ref_ce_loss": 0.03948172554373741, + "step": 15780 + }, + { + "epoch": 5.266844563042028, + "loss": 0.3594, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "grad_norm": 3.9597408771514893, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "learning_rate": 8.283436586762556e-05, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "loss": 0.3665906488895416, + "step": 15790 + }, + { + "ce_loss": 0.09277671575546265, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "distill_loss": 0.15249072015285492, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "ref_ce_loss": 0.10048972815275192, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "loss": 0.2455630600452423, + "step": 15790 + }, + { + "ce_loss": 0.0487009696662426, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "distill_loss": 0.092554971575737, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "ref_ce_loss": 0.050037186592817307, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "loss": 0.3036814033985138, + "step": 15790 + }, + { + "ce_loss": 0.0706629678606987, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "distill_loss": 0.12567178905010223, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "ref_ce_loss": 0.07737930864095688, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "loss": 0.2579875886440277, + "step": 15790 + }, + { + "ce_loss": 0.026779716834425926, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "distill_loss": 0.09495803713798523, + "epoch": 5.266844563042028, + "step": 15790 + }, + { + "epoch": 5.266844563042028, + "ref_ce_loss": 0.08277291059494019, + "step": 15790 + }, + { + "epoch": 5.270180120080053, + "loss": 0.3435, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "grad_norm": 2.0573623180389404, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "learning_rate": 8.265330720611883e-05, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "loss": 0.3189752697944641, + "step": 15800 + }, + { + "ce_loss": 0.08407336473464966, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "distill_loss": 0.11684277653694153, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "ref_ce_loss": 0.08707991987466812, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "loss": 0.21804724633693695, + "step": 15800 + }, + { + "ce_loss": 0.03380461037158966, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "distill_loss": 0.10913065075874329, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "ref_ce_loss": 0.056957922875881195, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "loss": 0.33023056387901306, + "step": 15800 + }, + { + "ce_loss": 0.048063404858112335, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "distill_loss": 0.11467498540878296, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "ref_ce_loss": 0.06214866414666176, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "loss": 0.2651209831237793, + "step": 15800 + }, + { + "ce_loss": 0.06443707644939423, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "distill_loss": 0.11779454350471497, + "epoch": 5.270180120080053, + "step": 15800 + }, + { + "epoch": 5.270180120080053, + "ref_ce_loss": 0.06641155481338501, + "step": 15800 + }, + { + "epoch": 5.273515677118079, + "loss": 0.3767, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "grad_norm": 3.061605215072632, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "learning_rate": 8.247237135841367e-05, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "loss": 0.30033445358276367, + "step": 15810 + }, + { + "ce_loss": 0.07328201830387115, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "distill_loss": 0.10676218569278717, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "ref_ce_loss": 0.0811660885810852, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "loss": 0.28703418374061584, + "step": 15810 + }, + { + "ce_loss": 0.04377947747707367, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "distill_loss": 0.1450316160917282, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "ref_ce_loss": 0.05514253303408623, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "loss": 0.35974201560020447, + "step": 15810 + }, + { + "ce_loss": 0.04705319181084633, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "distill_loss": 0.11349070072174072, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "ref_ce_loss": 0.055691562592983246, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "loss": 0.31948322057724, + "step": 15810 + }, + { + "ce_loss": 0.04331135377287865, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "distill_loss": 0.1292535811662674, + "epoch": 5.273515677118079, + "step": 15810 + }, + { + "epoch": 5.273515677118079, + "ref_ce_loss": 0.08645766228437424, + "step": 15810 + }, + { + "epoch": 5.276851234156104, + "loss": 0.366, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "grad_norm": 2.201096534729004, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "learning_rate": 8.229155865446575e-05, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "loss": 0.32561996579170227, + "step": 15820 + }, + { + "ce_loss": 0.1004878580570221, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "distill_loss": 0.09116347879171371, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "ref_ce_loss": 0.07758591324090958, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "loss": 0.19084353744983673, + "step": 15820 + }, + { + "ce_loss": 0.05388427525758743, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "distill_loss": 0.08523177355527878, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "ref_ce_loss": 0.05162469670176506, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "loss": 0.34104123711586, + "step": 15820 + }, + { + "ce_loss": 0.11164067685604095, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "distill_loss": 0.11855912208557129, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "ref_ce_loss": 0.1106509119272232, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "loss": 0.23398077487945557, + "step": 15820 + }, + { + "ce_loss": 0.08900479227304459, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "distill_loss": 0.07735835760831833, + "epoch": 5.276851234156104, + "step": 15820 + }, + { + "epoch": 5.276851234156104, + "ref_ce_loss": 0.05137255787849426, + "step": 15820 + }, + { + "epoch": 5.280186791194129, + "loss": 0.3493, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "grad_norm": 2.384784460067749, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "learning_rate": 8.211086942400596e-05, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "loss": 0.23829659819602966, + "step": 15830 + }, + { + "ce_loss": 0.04226939007639885, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "distill_loss": 0.10614483058452606, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "ref_ce_loss": 0.06560316681861877, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "loss": 0.2921256124973297, + "step": 15830 + }, + { + "ce_loss": 0.05392614006996155, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "distill_loss": 0.1369142383337021, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "ref_ce_loss": 0.05423854663968086, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "loss": 0.2778950035572052, + "step": 15830 + }, + { + "ce_loss": 0.11162376403808594, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "distill_loss": 0.08147963136434555, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "ref_ce_loss": 0.0669204592704773, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "loss": 0.3342251181602478, + "step": 15830 + }, + { + "ce_loss": 0.06660055369138718, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "distill_loss": 0.14492513239383698, + "epoch": 5.280186791194129, + "step": 15830 + }, + { + "epoch": 5.280186791194129, + "ref_ce_loss": 0.09336922317743301, + "step": 15830 + }, + { + "epoch": 5.283522348232155, + "loss": 0.3306, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "grad_norm": 2.8796446323394775, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "learning_rate": 8.193030399654027e-05, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "loss": 0.4431094527244568, + "step": 15840 + }, + { + "ce_loss": 0.11614301055669785, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "distill_loss": 0.09896580129861832, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "ref_ce_loss": 0.06625843048095703, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "loss": 0.2803962528705597, + "step": 15840 + }, + { + "ce_loss": 0.061902016401290894, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "distill_loss": 0.1138758510351181, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "ref_ce_loss": 0.06861334294080734, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "loss": 0.18780617415905, + "step": 15840 + }, + { + "ce_loss": 0.03424258530139923, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "distill_loss": 0.09116180986166, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "ref_ce_loss": 0.04498676210641861, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "loss": 0.36882030963897705, + "step": 15840 + }, + { + "ce_loss": 0.08330081403255463, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "distill_loss": 0.15934601426124573, + "epoch": 5.283522348232155, + "step": 15840 + }, + { + "epoch": 5.283522348232155, + "ref_ce_loss": 0.08243943750858307, + "step": 15840 + }, + { + "epoch": 5.28685790527018, + "loss": 0.3278, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "grad_norm": 2.047136068344116, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "learning_rate": 8.174986270134887e-05, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "loss": 0.2891218066215515, + "step": 15850 + }, + { + "ce_loss": 0.03525947779417038, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "distill_loss": 0.1262560933828354, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "ref_ce_loss": 0.08171357214450836, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "loss": 0.31089332699775696, + "step": 15850 + }, + { + "ce_loss": 0.05321585014462471, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "distill_loss": 0.12867404520511627, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "ref_ce_loss": 0.09286804497241974, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "loss": 0.2964179515838623, + "step": 15850 + }, + { + "ce_loss": 0.07718577980995178, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "distill_loss": 0.12015065550804138, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "ref_ce_loss": 0.07145271450281143, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "loss": 0.19068409502506256, + "step": 15850 + }, + { + "ce_loss": 0.04036962613463402, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "distill_loss": 0.10206376016139984, + "epoch": 5.28685790527018, + "step": 15850 + }, + { + "epoch": 5.28685790527018, + "ref_ce_loss": 0.04821521416306496, + "step": 15850 + }, + { + "epoch": 5.290193462308205, + "loss": 0.3316, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "grad_norm": 1.8144029378890991, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "learning_rate": 8.156954586748528e-05, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "loss": 0.3066614270210266, + "step": 15860 + }, + { + "ce_loss": 0.06004337593913078, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "distill_loss": 0.13541190326213837, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "ref_ce_loss": 0.0621790774166584, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "loss": 0.2303938865661621, + "step": 15860 + }, + { + "ce_loss": 0.0584101639688015, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "distill_loss": 0.0883975550532341, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "ref_ce_loss": 0.05530661344528198, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "loss": 0.4860450029373169, + "step": 15860 + }, + { + "ce_loss": 0.07743857055902481, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "distill_loss": 0.09735628217458725, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "ref_ce_loss": 0.10022785514593124, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "loss": 0.25716519355773926, + "step": 15860 + }, + { + "ce_loss": 0.06579107791185379, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "distill_loss": 0.09430144727230072, + "epoch": 5.290193462308205, + "step": 15860 + }, + { + "epoch": 5.290193462308205, + "ref_ce_loss": 0.05018524080514908, + "step": 15860 + }, + { + "epoch": 5.293529019346231, + "loss": 0.3463, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "grad_norm": 2.370684862136841, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "learning_rate": 8.138935382377653e-05, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "loss": 0.24651719629764557, + "step": 15870 + }, + { + "ce_loss": 0.08347075432538986, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "distill_loss": 0.10999614000320435, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "ref_ce_loss": 0.052934352308511734, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "loss": 0.49767571687698364, + "step": 15870 + }, + { + "ce_loss": 0.11719560623168945, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "distill_loss": 0.13959640264511108, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "ref_ce_loss": 0.12286118417978287, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "loss": 0.3884763717651367, + "step": 15870 + }, + { + "ce_loss": 0.07949317991733551, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "distill_loss": 0.13388612866401672, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "ref_ce_loss": 0.10045711696147919, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "loss": 0.24368393421173096, + "step": 15870 + }, + { + "ce_loss": 0.05252111330628395, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "distill_loss": 0.1036781445145607, + "epoch": 5.293529019346231, + "step": 15870 + }, + { + "epoch": 5.293529019346231, + "ref_ce_loss": 0.05971752852201462, + "step": 15870 + }, + { + "epoch": 5.296864576384256, + "loss": 0.2945, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "grad_norm": 1.8943488597869873, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "learning_rate": 8.120928689882166e-05, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "loss": 0.6232532262802124, + "step": 15880 + }, + { + "ce_loss": 0.0998808890581131, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "distill_loss": 0.09275517612695694, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "ref_ce_loss": 0.08853812515735626, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "loss": 0.45832279324531555, + "step": 15880 + }, + { + "ce_loss": 0.02765379287302494, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "distill_loss": 0.12325002253055573, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "ref_ce_loss": 0.09086009860038757, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "loss": 0.3936709761619568, + "step": 15880 + }, + { + "ce_loss": 0.09706886112689972, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "distill_loss": 0.11568113416433334, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "ref_ce_loss": 0.08714719116687775, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "loss": 0.3076615035533905, + "step": 15880 + }, + { + "ce_loss": 0.09075099229812622, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "distill_loss": 0.10523568093776703, + "epoch": 5.296864576384256, + "step": 15880 + }, + { + "epoch": 5.296864576384256, + "ref_ce_loss": 0.08547863364219666, + "step": 15880 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.3435, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "grad_norm": 4.160302639007568, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "learning_rate": 8.102934542099176e-05, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.3581370413303375, + "step": 15890 + }, + { + "ce_loss": 0.0933266431093216, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "distill_loss": 0.1521734744310379, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "ref_ce_loss": 0.0629705861210823, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.37069106101989746, + "step": 15890 + }, + { + "ce_loss": 0.05081234127283096, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "distill_loss": 0.09451035410165787, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "ref_ce_loss": 0.08984711766242981, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.4759184420108795, + "step": 15890 + }, + { + "ce_loss": 0.05965457856655121, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "distill_loss": 0.09128181636333466, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "ref_ce_loss": 0.08862599730491638, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "loss": 0.24063745141029358, + "step": 15890 + }, + { + "ce_loss": 0.06953100860118866, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "distill_loss": 0.111015185713768, + "epoch": 5.3002001334222815, + "step": 15890 + }, + { + "epoch": 5.3002001334222815, + "ref_ce_loss": 0.06000909581780434, + "step": 15890 + }, + { + "epoch": 5.303535690460307, + "loss": 0.3347, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "grad_norm": 3.271259307861328, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "learning_rate": 8.08495297184292e-05, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "loss": 0.2543596029281616, + "step": 15900 + }, + { + "ce_loss": 0.038422923535108566, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "distill_loss": 0.11588691174983978, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "ref_ce_loss": 0.0747591182589531, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "loss": 0.19756633043289185, + "step": 15900 + }, + { + "ce_loss": 0.03136131539940834, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "distill_loss": 0.09700172394514084, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "ref_ce_loss": 0.053317707031965256, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "loss": 0.4816906452178955, + "step": 15900 + }, + { + "ce_loss": 0.12503883242607117, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "distill_loss": 0.14960643649101257, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "ref_ce_loss": 0.06251875311136246, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "loss": 0.30442604422569275, + "step": 15900 + }, + { + "ce_loss": 0.09769739955663681, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "distill_loss": 0.09515450149774551, + "epoch": 5.303535690460307, + "step": 15900 + }, + { + "epoch": 5.303535690460307, + "ref_ce_loss": 0.09030892699956894, + "step": 15900 + }, + { + "epoch": 5.306871247498332, + "loss": 0.3591, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "grad_norm": 3.0482804775238037, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "learning_rate": 8.066984011904669e-05, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "loss": 0.2710125744342804, + "step": 15910 + }, + { + "ce_loss": 0.06486635655164719, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "distill_loss": 0.0854596421122551, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "ref_ce_loss": 0.0646849274635315, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "loss": 0.3071233332157135, + "step": 15910 + }, + { + "ce_loss": 0.05939917638897896, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "distill_loss": 0.09524413198232651, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "ref_ce_loss": 0.06739193946123123, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "loss": 0.2773936688899994, + "step": 15910 + }, + { + "ce_loss": 0.028691411018371582, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "distill_loss": 0.09702437371015549, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "ref_ce_loss": 0.09791380912065506, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "loss": 0.251413494348526, + "step": 15910 + }, + { + "ce_loss": 0.06908174604177475, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "distill_loss": 0.09879547357559204, + "epoch": 5.306871247498332, + "step": 15910 + }, + { + "epoch": 5.306871247498332, + "ref_ce_loss": 0.05396436154842377, + "step": 15910 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.3403, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "grad_norm": 3.299243927001953, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "learning_rate": 8.049027695052733e-05, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.32065755128860474, + "step": 15920 + }, + { + "ce_loss": 0.10038189589977264, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "distill_loss": 0.12436893582344055, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "ref_ce_loss": 0.0547565296292305, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.8609752058982849, + "step": 15920 + }, + { + "ce_loss": 0.1599891632795334, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "distill_loss": 0.16662029922008514, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "ref_ce_loss": 0.11310508102178574, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.28081509470939636, + "step": 15920 + }, + { + "ce_loss": 0.0595562569797039, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "distill_loss": 0.12638959288597107, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "ref_ce_loss": 0.09455878287553787, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "loss": 0.45147788524627686, + "step": 15920 + }, + { + "ce_loss": 0.08775141090154648, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "distill_loss": 0.10882918536663055, + "epoch": 5.3102068045363575, + "step": 15920 + }, + { + "epoch": 5.3102068045363575, + "ref_ce_loss": 0.10873819887638092, + "step": 15920 + }, + { + "epoch": 5.313542361574383, + "loss": 0.3594, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "grad_norm": 2.0879247188568115, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "learning_rate": 8.031084054032346e-05, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "loss": 0.4031994938850403, + "step": 15930 + }, + { + "ce_loss": 0.05465778335928917, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "distill_loss": 0.14434510469436646, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "ref_ce_loss": 0.07141335308551788, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "loss": 0.20144657790660858, + "step": 15930 + }, + { + "ce_loss": 0.056741055101156235, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "distill_loss": 0.08500836789608002, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "ref_ce_loss": 0.0596328042447567, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "loss": 0.33083662390708923, + "step": 15930 + }, + { + "ce_loss": 0.05826576426625252, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "distill_loss": 0.14457902312278748, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "ref_ce_loss": 0.0672142505645752, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "loss": 0.4372578263282776, + "step": 15930 + }, + { + "ce_loss": 0.11812935769557953, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "distill_loss": 0.1362246423959732, + "epoch": 5.313542361574383, + "step": 15930 + }, + { + "epoch": 5.313542361574383, + "ref_ce_loss": 0.09717027097940445, + "step": 15930 + }, + { + "epoch": 5.316877918612408, + "loss": 0.3511, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "grad_norm": 1.8663395643234253, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "learning_rate": 8.013153121565628e-05, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "loss": 0.26273471117019653, + "step": 15940 + }, + { + "ce_loss": 0.055545687675476074, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "distill_loss": 0.11238569021224976, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "ref_ce_loss": 0.07137372344732285, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "loss": 0.37487444281578064, + "step": 15940 + }, + { + "ce_loss": 0.05269518867135048, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "distill_loss": 0.11019299924373627, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "ref_ce_loss": 0.09977027773857117, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "loss": 0.35324230790138245, + "step": 15940 + }, + { + "ce_loss": 0.057631898671388626, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "distill_loss": 0.09934467077255249, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "ref_ce_loss": 0.07707151770591736, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "loss": 0.29354506731033325, + "step": 15940 + }, + { + "ce_loss": 0.06054156646132469, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "distill_loss": 0.11593253910541534, + "epoch": 5.316877918612408, + "step": 15940 + }, + { + "epoch": 5.316877918612408, + "ref_ce_loss": 0.09283595532178879, + "step": 15940 + }, + { + "epoch": 5.320213475650434, + "loss": 0.3001, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "grad_norm": 2.898165464401245, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "learning_rate": 7.995234930351538e-05, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "loss": 0.3119295835494995, + "step": 15950 + }, + { + "ce_loss": 0.1078239381313324, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "distill_loss": 0.1365964561700821, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "ref_ce_loss": 0.06740289181470871, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "loss": 0.23001202940940857, + "step": 15950 + }, + { + "ce_loss": 0.05635315924882889, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "distill_loss": 0.09114664047956467, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "ref_ce_loss": 0.056779682636260986, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "loss": 0.30822890996932983, + "step": 15950 + }, + { + "ce_loss": 0.1031709685921669, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "distill_loss": 0.12962231040000916, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "ref_ce_loss": 0.05823402479290962, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "loss": 0.2623670995235443, + "step": 15950 + }, + { + "ce_loss": 0.06514668464660645, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "distill_loss": 0.09800301492214203, + "epoch": 5.320213475650434, + "step": 15950 + }, + { + "epoch": 5.320213475650434, + "ref_ce_loss": 0.07522417604923248, + "step": 15950 + }, + { + "epoch": 5.323549032688459, + "loss": 0.357, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "grad_norm": 1.9690219163894653, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "learning_rate": 7.977329513065774e-05, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "loss": 0.26475730538368225, + "step": 15960 + }, + { + "ce_loss": 0.05676068738102913, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "distill_loss": 0.1186981201171875, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "ref_ce_loss": 0.0697123259305954, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "loss": 0.2583927512168884, + "step": 15960 + }, + { + "ce_loss": 0.042958781123161316, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "distill_loss": 0.11501401662826538, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "ref_ce_loss": 0.05779067054390907, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "loss": 0.22738508880138397, + "step": 15960 + }, + { + "ce_loss": 0.07362529635429382, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "distill_loss": 0.10928896069526672, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "ref_ce_loss": 0.044383592903614044, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "loss": 0.3220760226249695, + "step": 15960 + }, + { + "ce_loss": 0.06132000312209129, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "distill_loss": 0.10518957674503326, + "epoch": 5.323549032688459, + "step": 15960 + }, + { + "epoch": 5.323549032688459, + "ref_ce_loss": 0.09508439898490906, + "step": 15960 + }, + { + "epoch": 5.326884589726484, + "loss": 0.2984, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "grad_norm": 3.3353259563446045, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "learning_rate": 7.959436902360762e-05, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "loss": 0.32827815413475037, + "step": 15970 + }, + { + "ce_loss": 0.12878939509391785, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "distill_loss": 0.12784335017204285, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "ref_ce_loss": 0.05205840244889259, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "loss": 0.1396113932132721, + "step": 15970 + }, + { + "ce_loss": 0.014687249436974525, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "distill_loss": 0.07145895808935165, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "ref_ce_loss": 0.05341213196516037, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "loss": 0.25794681906700134, + "step": 15970 + }, + { + "ce_loss": 0.1045415922999382, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "distill_loss": 0.09720547497272491, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "ref_ce_loss": 0.055952686816453934, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "loss": 0.33006682991981506, + "step": 15970 + }, + { + "ce_loss": 0.07984444499015808, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "distill_loss": 0.13971658051013947, + "epoch": 5.326884589726484, + "step": 15970 + }, + { + "epoch": 5.326884589726484, + "ref_ce_loss": 0.1103811264038086, + "step": 15970 + }, + { + "epoch": 5.33022014676451, + "loss": 0.3209, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "grad_norm": 3.469141721725464, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "learning_rate": 7.941557130865565e-05, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "loss": 0.42776209115982056, + "step": 15980 + }, + { + "ce_loss": 0.04647199064493179, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "distill_loss": 0.11702238768339157, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "ref_ce_loss": 0.10681089758872986, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "loss": 0.13588321208953857, + "step": 15980 + }, + { + "ce_loss": 0.013034440577030182, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "distill_loss": 0.06615424901247025, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "ref_ce_loss": 0.04182055965065956, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "loss": 0.35110753774642944, + "step": 15980 + }, + { + "ce_loss": 0.08016197383403778, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "distill_loss": 0.11940719187259674, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "ref_ce_loss": 0.057936813682317734, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "loss": 0.6050047874450684, + "step": 15980 + }, + { + "ce_loss": 0.0430837981402874, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "distill_loss": 0.12484762072563171, + "epoch": 5.33022014676451, + "step": 15980 + }, + { + "epoch": 5.33022014676451, + "ref_ce_loss": 0.08736221492290497, + "step": 15980 + }, + { + "epoch": 5.333555703802535, + "loss": 0.3415, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "grad_norm": 2.1499571800231934, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "learning_rate": 7.923690231185833e-05, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "loss": 0.6607826948165894, + "step": 15990 + }, + { + "ce_loss": 0.055438101291656494, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "distill_loss": 0.14260601997375488, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "ref_ce_loss": 0.08038537204265594, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "loss": 0.5964380502700806, + "step": 15990 + }, + { + "ce_loss": 0.07627970725297928, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "distill_loss": 0.09748944640159607, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "ref_ce_loss": 0.10148187726736069, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "loss": 0.3148542642593384, + "step": 15990 + }, + { + "ce_loss": 0.08906195312738419, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "distill_loss": 0.11657960712909698, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "ref_ce_loss": 0.07575056701898575, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "loss": 0.28883641958236694, + "step": 15990 + }, + { + "ce_loss": 0.05575671046972275, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "distill_loss": 0.12796026468276978, + "epoch": 5.333555703802535, + "step": 15990 + }, + { + "epoch": 5.333555703802535, + "ref_ce_loss": 0.050566114485263824, + "step": 15990 + }, + { + "epoch": 5.33689126084056, + "loss": 0.3842, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "grad_norm": 4.591987133026123, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "learning_rate": 7.905836235903747e-05, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "loss": 0.304741770029068, + "step": 16000 + }, + { + "ce_loss": 0.09324005246162415, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "distill_loss": 0.11520503461360931, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "ref_ce_loss": 0.0958837941288948, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "loss": 0.2455434650182724, + "step": 16000 + }, + { + "ce_loss": 0.05510518327355385, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "distill_loss": 0.11733116209506989, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "ref_ce_loss": 0.03748934715986252, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "loss": 0.2709517776966095, + "step": 16000 + }, + { + "ce_loss": 0.09849530458450317, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "distill_loss": 0.0976298451423645, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "ref_ce_loss": 0.06337139010429382, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "loss": 0.35590457916259766, + "step": 16000 + }, + { + "ce_loss": 0.08008065819740295, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "distill_loss": 0.10405315458774567, + "epoch": 5.33689126084056, + "step": 16000 + }, + { + "epoch": 5.33689126084056, + "ref_ce_loss": 0.09750795364379883, + "step": 16000 + }, + { + "epoch": 5.340226817878586, + "loss": 0.3149, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "grad_norm": 2.3251688480377197, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "learning_rate": 7.887995177577942e-05, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "loss": 0.3700042963027954, + "step": 16010 + }, + { + "ce_loss": 0.07157202810049057, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "distill_loss": 0.11362545937299728, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "ref_ce_loss": 0.05933113023638725, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "loss": 0.6948315501213074, + "step": 16010 + }, + { + "ce_loss": 0.08553972840309143, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "distill_loss": 0.1357281357049942, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "ref_ce_loss": 0.07757649570703506, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "loss": 0.18372049927711487, + "step": 16010 + }, + { + "ce_loss": 0.00982450321316719, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "distill_loss": 0.0782545730471611, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "ref_ce_loss": 0.05903314799070358, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "loss": 0.5596423149108887, + "step": 16010 + }, + { + "ce_loss": 0.07517491281032562, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "distill_loss": 0.1120464950799942, + "epoch": 5.340226817878586, + "step": 16010 + }, + { + "epoch": 5.340226817878586, + "ref_ce_loss": 0.1148577556014061, + "step": 16010 + }, + { + "epoch": 5.343562374916611, + "loss": 0.3711, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "grad_norm": 2.9900999069213867, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "learning_rate": 7.870167088743476e-05, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "loss": 0.29928722977638245, + "step": 16020 + }, + { + "ce_loss": 0.10394757241010666, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "distill_loss": 0.12747430801391602, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "ref_ce_loss": 0.06767245382070541, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "loss": 0.5128778219223022, + "step": 16020 + }, + { + "ce_loss": 0.10501803457736969, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "distill_loss": 0.20294877886772156, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "ref_ce_loss": 0.08334807306528091, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "loss": 0.2776087820529938, + "step": 16020 + }, + { + "ce_loss": 0.07922020554542542, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "distill_loss": 0.13950836658477783, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "ref_ce_loss": 0.05875676870346069, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "loss": 0.5638596415519714, + "step": 16020 + }, + { + "ce_loss": 0.1316651850938797, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "distill_loss": 0.17273733019828796, + "epoch": 5.343562374916611, + "step": 16020 + }, + { + "epoch": 5.343562374916611, + "ref_ce_loss": 0.10516467690467834, + "step": 16020 + }, + { + "epoch": 5.346897931954636, + "loss": 0.3894, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "grad_norm": 2.5836825370788574, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "learning_rate": 7.852352001911752e-05, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "loss": 0.45821037888526917, + "step": 16030 + }, + { + "ce_loss": 0.05548872798681259, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "distill_loss": 0.09916659444570541, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "ref_ce_loss": 0.05415504053235054, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "loss": 0.3140331208705902, + "step": 16030 + }, + { + "ce_loss": 0.08456014841794968, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "distill_loss": 0.10989885777235031, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "ref_ce_loss": 0.056715261191129684, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "loss": 0.26189103722572327, + "step": 16030 + }, + { + "ce_loss": 0.062468308955430984, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "distill_loss": 0.11793512850999832, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "ref_ce_loss": 0.08089719712734222, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "loss": 0.23085886240005493, + "step": 16030 + }, + { + "ce_loss": 0.06034333258867264, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "distill_loss": 0.10786504298448563, + "epoch": 5.346897931954636, + "step": 16030 + }, + { + "epoch": 5.346897931954636, + "ref_ce_loss": 0.04875631257891655, + "step": 16030 + }, + { + "epoch": 5.350233488992662, + "loss": 0.3485, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "grad_norm": 2.6916708946228027, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "learning_rate": 7.834549949570459e-05, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "loss": 0.23726388812065125, + "step": 16040 + }, + { + "ce_loss": 0.058267880231142044, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "distill_loss": 0.11253169924020767, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "ref_ce_loss": 0.06635519862174988, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "loss": 0.3128780126571655, + "step": 16040 + }, + { + "ce_loss": 0.06482836604118347, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "distill_loss": 0.117494598031044, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "ref_ce_loss": 0.0603063702583313, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "loss": 0.2820833623409271, + "step": 16040 + }, + { + "ce_loss": 0.08603330701589584, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "distill_loss": 0.1037563756108284, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "ref_ce_loss": 0.09191624075174332, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "loss": 0.2623083293437958, + "step": 16040 + }, + { + "ce_loss": 0.06306131929159164, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "distill_loss": 0.10764463245868683, + "epoch": 5.350233488992662, + "step": 16040 + }, + { + "epoch": 5.350233488992662, + "ref_ce_loss": 0.06455370038747787, + "step": 16040 + }, + { + "epoch": 5.353569046030687, + "loss": 0.3229, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "grad_norm": 1.7106281518936157, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "learning_rate": 7.816760964183524e-05, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "loss": 0.29164308309555054, + "step": 16050 + }, + { + "ce_loss": 0.013757818378508091, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "distill_loss": 0.1073288768529892, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "ref_ce_loss": 0.05741368606686592, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "loss": 0.3547406494617462, + "step": 16050 + }, + { + "ce_loss": 0.03559216111898422, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "distill_loss": 0.11831948161125183, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "ref_ce_loss": 0.05722172558307648, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "loss": 0.2418895661830902, + "step": 16050 + }, + { + "ce_loss": 0.03584042191505432, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "distill_loss": 0.10411054641008377, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "ref_ce_loss": 0.06963283568620682, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "loss": 0.4107963442802429, + "step": 16050 + }, + { + "ce_loss": 0.14093990623950958, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "distill_loss": 0.16926631331443787, + "epoch": 5.353569046030687, + "step": 16050 + }, + { + "epoch": 5.353569046030687, + "ref_ce_loss": 0.0756896585226059, + "step": 16050 + }, + { + "epoch": 5.356904603068712, + "loss": 0.289, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "grad_norm": 1.9935156106948853, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "learning_rate": 7.798985078191028e-05, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "loss": 0.2680877149105072, + "step": 16060 + }, + { + "ce_loss": 0.038785114884376526, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "distill_loss": 0.10870185494422913, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "ref_ce_loss": 0.0915147140622139, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "loss": 0.23653815686702728, + "step": 16060 + }, + { + "ce_loss": 0.04102471470832825, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "distill_loss": 0.11374015361070633, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "ref_ce_loss": 0.08148464560508728, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "loss": 0.2153933346271515, + "step": 16060 + }, + { + "ce_loss": 0.04689103737473488, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "distill_loss": 0.08903343975543976, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "ref_ce_loss": 0.05705301836133003, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "loss": 0.3498300611972809, + "step": 16060 + }, + { + "ce_loss": 0.08931352943181992, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "distill_loss": 0.12247570604085922, + "epoch": 5.356904603068712, + "step": 16060 + }, + { + "epoch": 5.356904603068712, + "ref_ce_loss": 0.06273306906223297, + "step": 16060 + }, + { + "epoch": 5.360240160106738, + "loss": 0.3439, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "grad_norm": 4.033847808837891, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "learning_rate": 7.781222324009181e-05, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "loss": 0.3406941592693329, + "step": 16070 + }, + { + "ce_loss": 0.052824467420578, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "distill_loss": 0.12347150593996048, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "ref_ce_loss": 0.048340242356061935, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "loss": 0.39220917224884033, + "step": 16070 + }, + { + "ce_loss": 0.09756062924861908, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "distill_loss": 0.12279407680034637, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "ref_ce_loss": 0.079924076795578, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "loss": 0.31329527497291565, + "step": 16070 + }, + { + "ce_loss": 0.06221854314208031, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "distill_loss": 0.12405749410390854, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "ref_ce_loss": 0.08381244540214539, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "loss": 0.32966047525405884, + "step": 16070 + }, + { + "ce_loss": 0.05017072707414627, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "distill_loss": 0.08353301137685776, + "epoch": 5.360240160106738, + "step": 16070 + }, + { + "epoch": 5.360240160106738, + "ref_ce_loss": 0.06393556296825409, + "step": 16070 + }, + { + "epoch": 5.363575717144763, + "loss": 0.3435, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "grad_norm": 2.8178539276123047, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "learning_rate": 7.763472734030239e-05, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "loss": 0.5294066667556763, + "step": 16080 + }, + { + "ce_loss": 0.06446986645460129, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "distill_loss": 0.11884837597608566, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "ref_ce_loss": 0.07165850698947906, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "loss": 0.25549739599227905, + "step": 16080 + }, + { + "ce_loss": 0.038696497678756714, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "distill_loss": 0.0930166095495224, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "ref_ce_loss": 0.09175001084804535, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "loss": 0.2837029993534088, + "step": 16080 + }, + { + "ce_loss": 0.04583244025707245, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "distill_loss": 0.10166701674461365, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "ref_ce_loss": 0.0670868456363678, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "loss": 0.27036380767822266, + "step": 16080 + }, + { + "ce_loss": 0.007987498305737972, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "distill_loss": 0.08497060835361481, + "epoch": 5.363575717144763, + "step": 16080 + }, + { + "epoch": 5.363575717144763, + "ref_ce_loss": 0.05139423906803131, + "step": 16080 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.3336, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "grad_norm": 2.1278111934661865, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "learning_rate": 7.745736340622453e-05, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.31296786665916443, + "step": 16090 + }, + { + "ce_loss": 0.04599738121032715, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "distill_loss": 0.13405273854732513, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "ref_ce_loss": 0.08967866003513336, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.37006106972694397, + "step": 16090 + }, + { + "ce_loss": 0.09404456615447998, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "distill_loss": 0.14915797114372253, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "ref_ce_loss": 0.09474969655275345, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.38359758257865906, + "step": 16090 + }, + { + "ce_loss": 0.1558644324541092, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "distill_loss": 0.12019581347703934, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "ref_ce_loss": 0.08622688800096512, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "loss": 0.3088744878768921, + "step": 16090 + }, + { + "ce_loss": 0.06967803835868835, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "distill_loss": 0.10145791620016098, + "epoch": 5.3669112741827885, + "step": 16090 + }, + { + "epoch": 5.3669112741827885, + "ref_ce_loss": 0.07597280293703079, + "step": 16090 + }, + { + "epoch": 5.370246831220814, + "loss": 0.3361, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "grad_norm": 2.2339279651641846, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "learning_rate": 7.728013176130009e-05, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "loss": 0.20128178596496582, + "step": 16100 + }, + { + "ce_loss": 0.04678283631801605, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "distill_loss": 0.08286233991384506, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "ref_ce_loss": 0.05337541177868843, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "loss": 0.3456668555736542, + "step": 16100 + }, + { + "ce_loss": 0.044075287878513336, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "distill_loss": 0.10809840261936188, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "ref_ce_loss": 0.05553989112377167, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "loss": 0.2573688328266144, + "step": 16100 + }, + { + "ce_loss": 0.06272100657224655, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "distill_loss": 0.11115021258592606, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "ref_ce_loss": 0.06553357094526291, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "loss": 0.315351665019989, + "step": 16100 + }, + { + "ce_loss": 0.06446407735347748, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "distill_loss": 0.10399211943149567, + "epoch": 5.370246831220814, + "step": 16100 + }, + { + "epoch": 5.370246831220814, + "ref_ce_loss": 0.05777052417397499, + "step": 16100 + }, + { + "epoch": 5.373582388258839, + "loss": 0.3533, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "grad_norm": 4.180839538574219, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "learning_rate": 7.710303272872974e-05, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "loss": 0.3932115435600281, + "step": 16110 + }, + { + "ce_loss": 0.11435271799564362, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "distill_loss": 0.14050374925136566, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "ref_ce_loss": 0.07154138386249542, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "loss": 0.6406397223472595, + "step": 16110 + }, + { + "ce_loss": 0.09394969046115875, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "distill_loss": 0.13561400771141052, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "ref_ce_loss": 0.06444769352674484, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "loss": 0.506658136844635, + "step": 16110 + }, + { + "ce_loss": 0.1610911637544632, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "distill_loss": 0.11807151138782501, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "ref_ce_loss": 0.09151536226272583, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "loss": 0.3110598623752594, + "step": 16110 + }, + { + "ce_loss": 0.08785425871610641, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "distill_loss": 0.09481608867645264, + "epoch": 5.373582388258839, + "step": 16110 + }, + { + "epoch": 5.373582388258839, + "ref_ce_loss": 0.07922735810279846, + "step": 16110 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.3283, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "grad_norm": 2.140110969543457, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "learning_rate": 7.69260666314721e-05, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.4330398738384247, + "step": 16120 + }, + { + "ce_loss": 0.13992437720298767, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "distill_loss": 0.1396828144788742, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "ref_ce_loss": 0.08856870234012604, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.2571561336517334, + "step": 16120 + }, + { + "ce_loss": 0.04879157990217209, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "distill_loss": 0.11554741859436035, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "ref_ce_loss": 0.05843444913625717, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.2118985950946808, + "step": 16120 + }, + { + "ce_loss": 0.04279850423336029, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "distill_loss": 0.09749726951122284, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "ref_ce_loss": 0.07149596512317657, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "loss": 0.41140979528427124, + "step": 16120 + }, + { + "ce_loss": 0.11395663022994995, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "distill_loss": 0.11845146119594574, + "epoch": 5.3769179452968645, + "step": 16120 + }, + { + "epoch": 5.3769179452968645, + "ref_ce_loss": 0.05464179068803787, + "step": 16120 + }, + { + "epoch": 5.38025350233489, + "loss": 0.2989, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "grad_norm": 1.9423120021820068, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "learning_rate": 7.67492337922437e-05, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "loss": 0.2500401437282562, + "step": 16130 + }, + { + "ce_loss": 0.01709035225212574, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "distill_loss": 0.09783089905977249, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "ref_ce_loss": 0.058439701795578, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "loss": 0.30501842498779297, + "step": 16130 + }, + { + "ce_loss": 0.09198647737503052, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "distill_loss": 0.09595755487680435, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "ref_ce_loss": 0.08432227373123169, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "loss": 0.40725916624069214, + "step": 16130 + }, + { + "ce_loss": 0.07747211307287216, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "distill_loss": 0.08231954276561737, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "ref_ce_loss": 0.0659797340631485, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "loss": 0.7572364807128906, + "step": 16130 + }, + { + "ce_loss": 0.08224272727966309, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "distill_loss": 0.13095402717590332, + "epoch": 5.38025350233489, + "step": 16130 + }, + { + "epoch": 5.38025350233489, + "ref_ce_loss": 0.08251205086708069, + "step": 16130 + }, + { + "epoch": 5.383589059372915, + "loss": 0.3603, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "grad_norm": 1.8171334266662598, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "learning_rate": 7.657253453351765e-05, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "loss": 0.30455857515335083, + "step": 16140 + }, + { + "ce_loss": 0.10068642348051071, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "distill_loss": 0.11382429301738739, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "ref_ce_loss": 0.06328225135803223, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "loss": 0.38374200463294983, + "step": 16140 + }, + { + "ce_loss": 0.11451509594917297, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "distill_loss": 0.12740738689899445, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "ref_ce_loss": 0.1416238397359848, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "loss": 0.38903430104255676, + "step": 16140 + }, + { + "ce_loss": 0.10609883815050125, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "distill_loss": 0.1371476799249649, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "ref_ce_loss": 0.07509582489728928, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "loss": 0.30147454142570496, + "step": 16140 + }, + { + "ce_loss": 0.049935102462768555, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "distill_loss": 0.09309284389019012, + "epoch": 5.383589059372915, + "step": 16140 + }, + { + "epoch": 5.383589059372915, + "ref_ce_loss": 0.06776655465364456, + "step": 16140 + }, + { + "epoch": 5.386924616410941, + "loss": 0.3575, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "grad_norm": 3.655974864959717, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "learning_rate": 7.639596917752391e-05, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "loss": 0.28991663455963135, + "step": 16150 + }, + { + "ce_loss": 0.04967931658029556, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "distill_loss": 0.13226084411144257, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "ref_ce_loss": 0.07171325385570526, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "loss": 0.4191509485244751, + "step": 16150 + }, + { + "ce_loss": 0.07444164901971817, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "distill_loss": 0.10191843658685684, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "ref_ce_loss": 0.09404486417770386, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "loss": 0.2821800708770752, + "step": 16150 + }, + { + "ce_loss": 0.06736411899328232, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "distill_loss": 0.10012883692979813, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "ref_ce_loss": 0.06843230873346329, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "loss": 0.3395196795463562, + "step": 16150 + }, + { + "ce_loss": 0.07241164892911911, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "distill_loss": 0.11182572692632675, + "epoch": 5.386924616410941, + "step": 16150 + }, + { + "epoch": 5.386924616410941, + "ref_ce_loss": 0.06843020766973495, + "step": 16150 + }, + { + "epoch": 5.390260173448966, + "loss": 0.3259, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "grad_norm": 2.4135913848876953, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "learning_rate": 7.621953804624801e-05, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "loss": 0.38991570472717285, + "step": 16160 + }, + { + "ce_loss": 0.11925956606864929, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "distill_loss": 0.11014068126678467, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "ref_ce_loss": 0.060893464833498, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "loss": 0.23624181747436523, + "step": 16160 + }, + { + "ce_loss": 0.045740626752376556, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "distill_loss": 0.09637036919593811, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "ref_ce_loss": 0.07027741521596909, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "loss": 0.2636074721813202, + "step": 16160 + }, + { + "ce_loss": 0.03585965558886528, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "distill_loss": 0.08669553697109222, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "ref_ce_loss": 0.05847055837512016, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "loss": 0.6537226438522339, + "step": 16160 + }, + { + "ce_loss": 0.08080638200044632, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "distill_loss": 0.1580420434474945, + "epoch": 5.390260173448966, + "step": 16160 + }, + { + "epoch": 5.390260173448966, + "ref_ce_loss": 0.0891571044921875, + "step": 16160 + }, + { + "epoch": 5.393595730486991, + "loss": 0.3647, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "grad_norm": 1.73436439037323, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "learning_rate": 7.604324146143065e-05, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "loss": 0.3857395648956299, + "step": 16170 + }, + { + "ce_loss": 0.040800731629133224, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "distill_loss": 0.08445484936237335, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "ref_ce_loss": 0.07349581271409988, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "loss": 0.36219871044158936, + "step": 16170 + }, + { + "ce_loss": 0.09910664707422256, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "distill_loss": 0.13986824452877045, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "ref_ce_loss": 0.08963736146688461, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "loss": 0.3613256812095642, + "step": 16170 + }, + { + "ce_loss": 0.12792716920375824, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "distill_loss": 0.14017927646636963, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "ref_ce_loss": 0.09304498136043549, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "loss": 0.38659751415252686, + "step": 16170 + }, + { + "ce_loss": 0.06034409999847412, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "distill_loss": 0.0787641704082489, + "epoch": 5.393595730486991, + "step": 16170 + }, + { + "epoch": 5.393595730486991, + "ref_ce_loss": 0.08379752933979034, + "step": 16170 + }, + { + "epoch": 5.396931287525017, + "loss": 0.3448, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "grad_norm": 3.2747390270233154, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "learning_rate": 7.586707974456736e-05, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "loss": 0.35655203461647034, + "step": 16180 + }, + { + "ce_loss": 0.124222531914711, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "distill_loss": 0.1000693291425705, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "ref_ce_loss": 0.10748447477817535, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "loss": 0.4372347295284271, + "step": 16180 + }, + { + "ce_loss": 0.04162885248661041, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "distill_loss": 0.09314451366662979, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "ref_ce_loss": 0.09335581213235855, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "loss": 0.241681307554245, + "step": 16180 + }, + { + "ce_loss": 0.03342455253005028, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "distill_loss": 0.08295845240354538, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "ref_ce_loss": 0.07485225051641464, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "loss": 0.31029412150382996, + "step": 16180 + }, + { + "ce_loss": 0.09854529798030853, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "distill_loss": 0.11103243380784988, + "epoch": 5.396931287525017, + "step": 16180 + }, + { + "epoch": 5.396931287525017, + "ref_ce_loss": 0.06289716809988022, + "step": 16180 + }, + { + "epoch": 5.400266844563042, + "loss": 0.3405, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "grad_norm": 4.361851215362549, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "learning_rate": 7.569105321690752e-05, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "loss": 0.2974907457828522, + "step": 16190 + }, + { + "ce_loss": 0.039553042501211166, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "distill_loss": 0.12818855047225952, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "ref_ce_loss": 0.0701039582490921, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "loss": 0.21528084576129913, + "step": 16190 + }, + { + "ce_loss": 0.06046629697084427, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "distill_loss": 0.08600938320159912, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "ref_ce_loss": 0.05630891025066376, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "loss": 0.2935210168361664, + "step": 16190 + }, + { + "ce_loss": 0.09160295873880386, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "distill_loss": 0.10282592475414276, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "ref_ce_loss": 0.0734093189239502, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "loss": 0.24886572360992432, + "step": 16190 + }, + { + "ce_loss": 0.03738119453191757, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "distill_loss": 0.10953368246555328, + "epoch": 5.400266844563042, + "step": 16190 + }, + { + "epoch": 5.400266844563042, + "ref_ce_loss": 0.04157862067222595, + "step": 16190 + }, + { + "epoch": 5.403602401601067, + "loss": 0.3089, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "grad_norm": 2.205479621887207, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "learning_rate": 7.551516219945406e-05, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "loss": 0.2098427563905716, + "step": 16200 + }, + { + "ce_loss": 0.030090736225247383, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "distill_loss": 0.09558893740177155, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "ref_ce_loss": 0.055292759090662, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "loss": 0.3873053193092346, + "step": 16200 + }, + { + "ce_loss": 0.06212102994322777, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "distill_loss": 0.0861755907535553, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "ref_ce_loss": 0.12519577145576477, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "loss": 0.317695289850235, + "step": 16200 + }, + { + "ce_loss": 0.06708138436079025, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "distill_loss": 0.11926654726266861, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "ref_ce_loss": 0.10601387917995453, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "loss": 0.39536064863204956, + "step": 16200 + }, + { + "ce_loss": 0.13016223907470703, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "distill_loss": 0.09881344437599182, + "epoch": 5.403602401601067, + "step": 16200 + }, + { + "epoch": 5.403602401601067, + "ref_ce_loss": 0.08258476853370667, + "step": 16200 + }, + { + "epoch": 5.406937958639093, + "loss": 0.3505, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "grad_norm": 3.33132004737854, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "learning_rate": 7.533940701296298e-05, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "loss": 0.507758378982544, + "step": 16210 + }, + { + "ce_loss": 0.11343388259410858, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "distill_loss": 0.13680988550186157, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "ref_ce_loss": 0.07887157797813416, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "loss": 0.20069199800491333, + "step": 16210 + }, + { + "ce_loss": 0.02618763968348503, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "distill_loss": 0.08316341787576675, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "ref_ce_loss": 0.06166895106434822, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "loss": 0.343496710062027, + "step": 16210 + }, + { + "ce_loss": 0.07581013441085815, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "distill_loss": 0.11562657356262207, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "ref_ce_loss": 0.07594557106494904, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "loss": 0.3016616404056549, + "step": 16210 + }, + { + "ce_loss": 0.10311080515384674, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "distill_loss": 0.09994910657405853, + "epoch": 5.406937958639093, + "step": 16210 + }, + { + "epoch": 5.406937958639093, + "ref_ce_loss": 0.07250438630580902, + "step": 16210 + }, + { + "epoch": 5.410273515677118, + "loss": 0.3128, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "grad_norm": 2.0081982612609863, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "learning_rate": 7.516378797794228e-05, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "loss": 0.341609925031662, + "step": 16220 + }, + { + "ce_loss": 0.06672375649213791, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "distill_loss": 0.10152589529752731, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "ref_ce_loss": 0.1350449025630951, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "loss": 0.35093531012535095, + "step": 16220 + }, + { + "ce_loss": 0.11491996794939041, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "distill_loss": 0.12406434863805771, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "ref_ce_loss": 0.07030671089887619, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "loss": 0.21612723171710968, + "step": 16220 + }, + { + "ce_loss": 0.0429653525352478, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "distill_loss": 0.107479527592659, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "ref_ce_loss": 0.0655955970287323, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "loss": 0.26148879528045654, + "step": 16220 + }, + { + "ce_loss": 0.00934822577983141, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "distill_loss": 0.08704058080911636, + "epoch": 5.410273515677118, + "step": 16220 + }, + { + "epoch": 5.410273515677118, + "ref_ce_loss": 0.059622831642627716, + "step": 16220 + }, + { + "epoch": 5.413609072715143, + "loss": 0.3393, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "grad_norm": 3.420259475708008, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "learning_rate": 7.49883054146518e-05, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "loss": 0.26101815700531006, + "step": 16230 + }, + { + "ce_loss": 0.03194340318441391, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "distill_loss": 0.07808404415845871, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "ref_ce_loss": 0.06558459252119064, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "loss": 0.4477325677871704, + "step": 16230 + }, + { + "ce_loss": 0.053452376276254654, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "distill_loss": 0.09510692209005356, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "ref_ce_loss": 0.0880439504981041, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "loss": 0.29558783769607544, + "step": 16230 + }, + { + "ce_loss": 0.10143938660621643, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "distill_loss": 0.12191247940063477, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "ref_ce_loss": 0.07203470170497894, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "loss": 0.2230197787284851, + "step": 16230 + }, + { + "ce_loss": 0.053148508071899414, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "distill_loss": 0.10254699736833572, + "epoch": 5.413609072715143, + "step": 16230 + }, + { + "epoch": 5.413609072715143, + "ref_ce_loss": 0.05377613380551338, + "step": 16230 + }, + { + "epoch": 5.416944629753169, + "loss": 0.3343, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "grad_norm": 2.3163440227508545, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "learning_rate": 7.481295964310263e-05, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "loss": 0.27103978395462036, + "step": 16240 + }, + { + "ce_loss": 0.07148025184869766, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "distill_loss": 0.10926101356744766, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "ref_ce_loss": 0.09023928642272949, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "loss": 0.30508822202682495, + "step": 16240 + }, + { + "ce_loss": 0.0872962698340416, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "distill_loss": 0.12328356504440308, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "ref_ce_loss": 0.06537864357233047, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "loss": 0.2479790449142456, + "step": 16240 + }, + { + "ce_loss": 0.05155964568257332, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "distill_loss": 0.09833475947380066, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "ref_ce_loss": 0.059581857174634933, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "loss": 0.4399837851524353, + "step": 16240 + }, + { + "ce_loss": 0.09402266144752502, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "distill_loss": 0.12486834824085236, + "epoch": 5.416944629753169, + "step": 16240 + }, + { + "epoch": 5.416944629753169, + "ref_ce_loss": 0.06681935489177704, + "step": 16240 + }, + { + "epoch": 5.420280186791194, + "loss": 0.3358, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "grad_norm": 2.0072333812713623, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "learning_rate": 7.463775098305612e-05, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "loss": 0.23055459558963776, + "step": 16250 + }, + { + "ce_loss": 0.05385155603289604, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "distill_loss": 0.10702510178089142, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "ref_ce_loss": 0.059084367007017136, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "loss": 0.24993188679218292, + "step": 16250 + }, + { + "ce_loss": 0.048931531608104706, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "distill_loss": 0.10967432707548141, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "ref_ce_loss": 0.05932708829641342, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "loss": 0.5765171647071838, + "step": 16250 + }, + { + "ce_loss": 0.12102974206209183, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "distill_loss": 0.15688693523406982, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "ref_ce_loss": 0.11446768790483475, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "loss": 0.44742411375045776, + "step": 16250 + }, + { + "ce_loss": 0.04978395998477936, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "distill_loss": 0.09400615096092224, + "epoch": 5.420280186791194, + "step": 16250 + }, + { + "epoch": 5.420280186791194, + "ref_ce_loss": 0.11323413252830505, + "step": 16250 + }, + { + "epoch": 5.423615743829219, + "loss": 0.3884, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "grad_norm": 2.7448010444641113, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "learning_rate": 7.446267975402385e-05, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "loss": 0.23236140608787537, + "step": 16260 + }, + { + "ce_loss": 0.06890847533941269, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "distill_loss": 0.09854884445667267, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "ref_ce_loss": 0.06464102864265442, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "loss": 0.2084226757287979, + "step": 16260 + }, + { + "ce_loss": 0.026601558551192284, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "distill_loss": 0.09681437164545059, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "ref_ce_loss": 0.05920984968543053, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "loss": 0.4451378583908081, + "step": 16260 + }, + { + "ce_loss": 0.13203752040863037, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "distill_loss": 0.13270580768585205, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "ref_ce_loss": 0.12923528254032135, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "loss": 0.37428125739097595, + "step": 16260 + }, + { + "ce_loss": 0.13738876581192017, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "distill_loss": 0.1335502564907074, + "epoch": 5.423615743829219, + "step": 16260 + }, + { + "epoch": 5.423615743829219, + "ref_ce_loss": 0.08215318620204926, + "step": 16260 + }, + { + "epoch": 5.426951300867245, + "loss": 0.3326, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "grad_norm": 1.8034840822219849, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "learning_rate": 7.428774627526667e-05, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "loss": 0.597827672958374, + "step": 16270 + }, + { + "ce_loss": 0.0970778539776802, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "distill_loss": 0.09443487972021103, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "ref_ce_loss": 0.07590417563915253, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "loss": 0.3486078381538391, + "step": 16270 + }, + { + "ce_loss": 0.08969951421022415, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "distill_loss": 0.09644579142332077, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "ref_ce_loss": 0.10284632444381714, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "loss": 0.3421056270599365, + "step": 16270 + }, + { + "ce_loss": 0.12405972927808762, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "distill_loss": 0.11241130530834198, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "ref_ce_loss": 0.07050801813602448, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "loss": 0.32441556453704834, + "step": 16270 + }, + { + "ce_loss": 0.04846134036779404, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "distill_loss": 0.08198875188827515, + "epoch": 5.426951300867245, + "step": 16270 + }, + { + "epoch": 5.426951300867245, + "ref_ce_loss": 0.044600147753953934, + "step": 16270 + }, + { + "epoch": 5.43028685790527, + "loss": 0.337, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "grad_norm": 2.3241167068481445, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "learning_rate": 7.411295086579422e-05, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "loss": 0.372223824262619, + "step": 16280 + }, + { + "ce_loss": 0.11412045359611511, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "distill_loss": 0.11776827275753021, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "ref_ce_loss": 0.1010143980383873, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "loss": 0.25290191173553467, + "step": 16280 + }, + { + "ce_loss": 0.023051615804433823, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "distill_loss": 0.07711265236139297, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "ref_ce_loss": 0.0660116896033287, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "loss": 0.2648986876010895, + "step": 16280 + }, + { + "ce_loss": 0.0392785519361496, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "distill_loss": 0.103148452937603, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "ref_ce_loss": 0.06433369219303131, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "loss": 0.32053670287132263, + "step": 16280 + }, + { + "ce_loss": 0.09016153961420059, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "distill_loss": 0.13692647218704224, + "epoch": 5.43028685790527, + "step": 16280 + }, + { + "epoch": 5.43028685790527, + "ref_ce_loss": 0.06246296316385269, + "step": 16280 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.3091, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "grad_norm": 1.8611153364181519, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "learning_rate": 7.393829384436447e-05, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.28557321429252625, + "step": 16290 + }, + { + "ce_loss": 0.06013104319572449, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "distill_loss": 0.107640340924263, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "ref_ce_loss": 0.11766494065523148, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.37696021795272827, + "step": 16290 + }, + { + "ce_loss": 0.025803005322813988, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "distill_loss": 0.08128707110881805, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "ref_ce_loss": 0.0619768463075161, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.33180344104766846, + "step": 16290 + }, + { + "ce_loss": 0.07863081246614456, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "distill_loss": 0.11234287917613983, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "ref_ce_loss": 0.11700798571109772, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "loss": 0.32060012221336365, + "step": 16290 + }, + { + "ce_loss": 0.09802191704511642, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "distill_loss": 0.09087618440389633, + "epoch": 5.4336224149432955, + "step": 16290 + }, + { + "epoch": 5.4336224149432955, + "ref_ce_loss": 0.09540796279907227, + "step": 16290 + }, + { + "epoch": 5.436957971981321, + "loss": 0.3082, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "grad_norm": 1.750720739364624, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "learning_rate": 7.37637755294828e-05, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "loss": 0.31910401582717896, + "step": 16300 + }, + { + "ce_loss": 0.02396697923541069, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "distill_loss": 0.11216147243976593, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "ref_ce_loss": 0.08133500814437866, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "loss": 0.13243257999420166, + "step": 16300 + }, + { + "ce_loss": 0.013941922225058079, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "distill_loss": 0.06792638450860977, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "ref_ce_loss": 0.03205156326293945, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "loss": 0.24062950909137726, + "step": 16300 + }, + { + "ce_loss": 0.06546594202518463, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "distill_loss": 0.09706707298755646, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "ref_ce_loss": 0.07798836380243301, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "loss": 0.3027392029762268, + "step": 16300 + }, + { + "ce_loss": 0.07704725116491318, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "distill_loss": 0.121660515666008, + "epoch": 5.436957971981321, + "step": 16300 + }, + { + "epoch": 5.436957971981321, + "ref_ce_loss": 0.10396517068147659, + "step": 16300 + }, + { + "epoch": 5.440293529019346, + "loss": 0.3203, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "grad_norm": 3.087531805038452, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "learning_rate": 7.358939623940182e-05, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "loss": 0.1709238737821579, + "step": 16310 + }, + { + "ce_loss": 0.04030200093984604, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "distill_loss": 0.08430983126163483, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "ref_ce_loss": 0.04611937701702118, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "loss": 1.0106971263885498, + "step": 16310 + }, + { + "ce_loss": 0.13990041613578796, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "distill_loss": 0.1668081283569336, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "ref_ce_loss": 0.10148528963327408, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "loss": 0.1992446333169937, + "step": 16310 + }, + { + "ce_loss": 0.04668141156435013, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "distill_loss": 0.10151037573814392, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "ref_ce_loss": 0.05095856264233589, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "loss": 0.2627887427806854, + "step": 16310 + }, + { + "ce_loss": 0.08537213504314423, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "distill_loss": 0.10909847170114517, + "epoch": 5.440293529019346, + "step": 16310 + }, + { + "epoch": 5.440293529019346, + "ref_ce_loss": 0.0681203156709671, + "step": 16310 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.3939, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "grad_norm": 2.990647315979004, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "learning_rate": 7.341515629212056e-05, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.4164668321609497, + "step": 16320 + }, + { + "ce_loss": 0.11799903959035873, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "distill_loss": 0.12904855608940125, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "ref_ce_loss": 0.12566408514976501, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.31384703516960144, + "step": 16320 + }, + { + "ce_loss": 0.0404597632586956, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "distill_loss": 0.12027452886104584, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "ref_ce_loss": 0.08330576866865158, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.2575395107269287, + "step": 16320 + }, + { + "ce_loss": 0.01610216125845909, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "distill_loss": 0.12123523652553558, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "ref_ce_loss": 0.05706319212913513, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "loss": 0.20323054492473602, + "step": 16320 + }, + { + "ce_loss": 0.021865028887987137, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "distill_loss": 0.09164980798959732, + "epoch": 5.4436290860573715, + "step": 16320 + }, + { + "epoch": 5.4436290860573715, + "ref_ce_loss": 0.06141813471913338, + "step": 16320 + }, + { + "epoch": 5.446964643095397, + "loss": 0.3603, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "grad_norm": 2.530071496963501, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "learning_rate": 7.324105600538398e-05, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "loss": 0.3814869523048401, + "step": 16330 + }, + { + "ce_loss": 0.07421386986970901, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "distill_loss": 0.11971503496170044, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "ref_ce_loss": 0.087638258934021, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "loss": 0.2262338250875473, + "step": 16330 + }, + { + "ce_loss": 0.04096594080328941, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "distill_loss": 0.09016257524490356, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "ref_ce_loss": 0.05134720727801323, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "loss": 0.2489914894104004, + "step": 16330 + }, + { + "ce_loss": 0.08123141527175903, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "distill_loss": 0.09971652925014496, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "ref_ce_loss": 0.04955803602933884, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "loss": 0.2052384614944458, + "step": 16330 + }, + { + "ce_loss": 0.07025023549795151, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "distill_loss": 0.09092271327972412, + "epoch": 5.446964643095397, + "step": 16330 + }, + { + "epoch": 5.446964643095397, + "ref_ce_loss": 0.04397953301668167, + "step": 16330 + }, + { + "epoch": 5.450300200133422, + "loss": 0.3527, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "grad_norm": 3.414726972579956, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "learning_rate": 7.306709569668236e-05, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "loss": 0.24532371759414673, + "step": 16340 + }, + { + "ce_loss": 0.060081496834754944, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "distill_loss": 0.09829045087099075, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "ref_ce_loss": 0.046170350164175034, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "loss": 0.3321872055530548, + "step": 16340 + }, + { + "ce_loss": 0.08811584860086441, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "distill_loss": 0.1313212364912033, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "ref_ce_loss": 0.08971906453371048, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "loss": 0.4201227128505707, + "step": 16340 + }, + { + "ce_loss": 0.17822621762752533, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "distill_loss": 0.14979350566864014, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "ref_ce_loss": 0.07139469683170319, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "loss": 0.38188281655311584, + "step": 16340 + }, + { + "ce_loss": 0.08560221642255783, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "distill_loss": 0.1589680016040802, + "epoch": 5.450300200133422, + "step": 16340 + }, + { + "epoch": 5.450300200133422, + "ref_ce_loss": 0.11068110167980194, + "step": 16340 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.3255, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "grad_norm": 1.9104830026626587, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "learning_rate": 7.289327568325061e-05, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.2057226002216339, + "step": 16350 + }, + { + "ce_loss": 0.013312791474163532, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "distill_loss": 0.10898993909358978, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "ref_ce_loss": 0.06706269085407257, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.3488602936267853, + "step": 16350 + }, + { + "ce_loss": 0.1517920345067978, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "distill_loss": 0.12502455711364746, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "ref_ce_loss": 0.07182005792856216, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.38063332438468933, + "step": 16350 + }, + { + "ce_loss": 0.0700574442744255, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "distill_loss": 0.13038572669029236, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "ref_ce_loss": 0.07677096873521805, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "loss": 0.17652976512908936, + "step": 16350 + }, + { + "ce_loss": 0.01483121793717146, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "distill_loss": 0.0829794704914093, + "epoch": 5.4536357571714476, + "step": 16350 + }, + { + "epoch": 5.4536357571714476, + "ref_ce_loss": 0.058518633246421814, + "step": 16350 + }, + { + "epoch": 5.456971314209473, + "loss": 0.3239, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "grad_norm": 1.9413048028945923, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "learning_rate": 7.271959628206786e-05, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "loss": 0.20399023592472076, + "step": 16360 + }, + { + "ce_loss": 0.026615489274263382, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "distill_loss": 0.0943148136138916, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "ref_ce_loss": 0.05922786518931389, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "loss": 0.258375346660614, + "step": 16360 + }, + { + "ce_loss": 0.06289554387331009, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "distill_loss": 0.1057371124625206, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "ref_ce_loss": 0.05854816362261772, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "loss": 0.2847128212451935, + "step": 16360 + }, + { + "ce_loss": 0.07973343133926392, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "distill_loss": 0.10827630013227463, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "ref_ce_loss": 0.09657879173755646, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "loss": 0.5999765992164612, + "step": 16360 + }, + { + "ce_loss": 0.12350272387266159, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "distill_loss": 0.1628512442111969, + "epoch": 5.456971314209473, + "step": 16360 + }, + { + "epoch": 5.456971314209473, + "ref_ce_loss": 0.10591977089643478, + "step": 16360 + }, + { + "epoch": 5.460306871247498, + "loss": 0.3582, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "grad_norm": 2.5877041816711426, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "learning_rate": 7.254605780985687e-05, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "loss": 0.31887102127075195, + "step": 16370 + }, + { + "ce_loss": 0.11570335924625397, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "distill_loss": 0.1031806692481041, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "ref_ce_loss": 0.0757158175110817, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "loss": 0.302977591753006, + "step": 16370 + }, + { + "ce_loss": 0.09307489544153214, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "distill_loss": 0.12041888386011124, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "ref_ce_loss": 0.060292188078165054, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "loss": 0.38156723976135254, + "step": 16370 + }, + { + "ce_loss": 0.05984897539019585, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "distill_loss": 0.11454299837350845, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "ref_ce_loss": 0.07398920506238937, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "loss": 0.2736813724040985, + "step": 16370 + }, + { + "ce_loss": 0.11653132736682892, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "distill_loss": 0.10505042225122452, + "epoch": 5.460306871247498, + "step": 16370 + }, + { + "epoch": 5.460306871247498, + "ref_ce_loss": 0.0519915409386158, + "step": 16370 + }, + { + "epoch": 5.463642428285524, + "loss": 0.3778, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "grad_norm": 3.572533369064331, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "learning_rate": 7.237266058308337e-05, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "loss": 0.30562925338745117, + "step": 16380 + }, + { + "ce_loss": 0.07567353546619415, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "distill_loss": 0.11570107936859131, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "ref_ce_loss": 0.07223859429359436, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "loss": 0.2510024905204773, + "step": 16380 + }, + { + "ce_loss": 0.06031372770667076, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "distill_loss": 0.11193616688251495, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "ref_ce_loss": 0.05297223851084709, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "loss": 0.21337854862213135, + "step": 16380 + }, + { + "ce_loss": 0.030846117064356804, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "distill_loss": 0.09279409050941467, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "ref_ce_loss": 0.05850100889801979, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "loss": 0.6940407156944275, + "step": 16380 + }, + { + "ce_loss": 0.06561942398548126, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "distill_loss": 0.08580558747053146, + "epoch": 5.463642428285524, + "step": 16380 + }, + { + "epoch": 5.463642428285524, + "ref_ce_loss": 0.040098849684000015, + "step": 16380 + }, + { + "epoch": 5.466977985323549, + "loss": 0.3414, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "grad_norm": 3.0983810424804688, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "learning_rate": 7.21994049179555e-05, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "loss": 0.28001460433006287, + "step": 16390 + }, + { + "ce_loss": 0.047482822090387344, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "distill_loss": 0.10146171599626541, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "ref_ce_loss": 0.06448373943567276, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "loss": 0.3621969223022461, + "step": 16390 + }, + { + "ce_loss": 0.1023976057767868, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "distill_loss": 0.14057931303977966, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "ref_ce_loss": 0.08347362279891968, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "loss": 0.34943342208862305, + "step": 16390 + }, + { + "ce_loss": 0.05505457520484924, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "distill_loss": 0.11126148700714111, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "ref_ce_loss": 0.08371064811944962, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "loss": 0.2689695358276367, + "step": 16390 + }, + { + "ce_loss": 0.07311775535345078, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "distill_loss": 0.09362819790840149, + "epoch": 5.466977985323549, + "step": 16390 + }, + { + "epoch": 5.466977985323549, + "ref_ce_loss": 0.04809510335326195, + "step": 16390 + }, + { + "epoch": 5.470313542361574, + "loss": 0.3496, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "grad_norm": 2.867830753326416, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "learning_rate": 7.20262911304232e-05, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "loss": 0.3608822524547577, + "step": 16400 + }, + { + "ce_loss": 0.06633630394935608, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "distill_loss": 0.16216693818569183, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "ref_ce_loss": 0.10717830061912537, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "loss": 0.29830560088157654, + "step": 16400 + }, + { + "ce_loss": 0.08958641439676285, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "distill_loss": 0.1087767630815506, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "ref_ce_loss": 0.06897032260894775, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "loss": 0.44855913519859314, + "step": 16400 + }, + { + "ce_loss": 0.16237951815128326, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "distill_loss": 0.13723891973495483, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "ref_ce_loss": 0.1202978566288948, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "loss": 0.2657770812511444, + "step": 16400 + }, + { + "ce_loss": 0.043764788657426834, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "distill_loss": 0.09835869073867798, + "epoch": 5.470313542361574, + "step": 16400 + }, + { + "epoch": 5.470313542361574, + "ref_ce_loss": 0.04671204835176468, + "step": 16400 + }, + { + "epoch": 5.4736490993996, + "loss": 0.3583, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "grad_norm": 2.4175097942352295, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "learning_rate": 7.185331953617774e-05, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "loss": 0.3450283408164978, + "step": 16410 + }, + { + "ce_loss": 0.09441274404525757, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "distill_loss": 0.18175029754638672, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "ref_ce_loss": 0.0529416985809803, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "loss": 0.26326984167099, + "step": 16410 + }, + { + "ce_loss": 0.04440765455365181, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "distill_loss": 0.12239554524421692, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "ref_ce_loss": 0.09604854881763458, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "loss": 0.2444891482591629, + "step": 16410 + }, + { + "ce_loss": 0.04429222643375397, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "distill_loss": 0.1255653202533722, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "ref_ce_loss": 0.07446930557489395, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "loss": 0.3559562861919403, + "step": 16410 + }, + { + "ce_loss": 0.11063433438539505, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "distill_loss": 0.1508217453956604, + "epoch": 5.4736490993996, + "step": 16410 + }, + { + "epoch": 5.4736490993996, + "ref_ce_loss": 0.07082000374794006, + "step": 16410 + }, + { + "epoch": 5.476984656437625, + "loss": 0.4042, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "grad_norm": 2.4823498725891113, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "learning_rate": 7.16804904506511e-05, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "loss": 0.3447977900505066, + "step": 16420 + }, + { + "ce_loss": 0.08455787599086761, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "distill_loss": 0.1442045271396637, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "ref_ce_loss": 0.06625892966985703, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "loss": 0.5153990983963013, + "step": 16420 + }, + { + "ce_loss": 0.0379730723798275, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "distill_loss": 0.13656309247016907, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "ref_ce_loss": 0.0743139386177063, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "loss": 0.5950380563735962, + "step": 16420 + }, + { + "ce_loss": 0.14564505219459534, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "distill_loss": 0.1893148124217987, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "ref_ce_loss": 0.09283700585365295, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "loss": 0.23283891379833221, + "step": 16420 + }, + { + "ce_loss": 0.07694301754236221, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "distill_loss": 0.10071247071027756, + "epoch": 5.476984656437625, + "step": 16420 + }, + { + "epoch": 5.476984656437625, + "ref_ce_loss": 0.05511435121297836, + "step": 16420 + }, + { + "epoch": 5.48032021347565, + "loss": 0.3878, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "grad_norm": 5.472512722015381, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "learning_rate": 7.150780418901537e-05, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "loss": 0.308619886636734, + "step": 16430 + }, + { + "ce_loss": 0.06344080716371536, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "distill_loss": 0.13703030347824097, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "ref_ce_loss": 0.08027543872594833, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "loss": 0.43004536628723145, + "step": 16430 + }, + { + "ce_loss": 0.09899048507213593, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "distill_loss": 0.11898726969957352, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "ref_ce_loss": 0.09035888314247131, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "loss": 0.2632295489311218, + "step": 16430 + }, + { + "ce_loss": 0.06394274532794952, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "distill_loss": 0.1264699250459671, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "ref_ce_loss": 0.05528374761343002, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "loss": 0.2657163143157959, + "step": 16430 + }, + { + "ce_loss": 0.04656196013092995, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "distill_loss": 0.1124621257185936, + "epoch": 5.48032021347565, + "step": 16430 + }, + { + "epoch": 5.48032021347565, + "ref_ce_loss": 0.054182395339012146, + "step": 16430 + }, + { + "epoch": 5.483655770513676, + "loss": 0.36, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "grad_norm": 2.9450929164886475, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "learning_rate": 7.13352610661822e-05, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "loss": 0.4299405515193939, + "step": 16440 + }, + { + "ce_loss": 0.040338076651096344, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "distill_loss": 0.1451733559370041, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "ref_ce_loss": 0.06834650039672852, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "loss": 0.5239918828010559, + "step": 16440 + }, + { + "ce_loss": 0.10494782030582428, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "distill_loss": 0.13455775380134583, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "ref_ce_loss": 0.08458808809518814, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "loss": 0.31824690103530884, + "step": 16440 + }, + { + "ce_loss": 0.07870815694332123, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "distill_loss": 0.11882781237363815, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "ref_ce_loss": 0.05696150287985802, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "loss": 0.36554384231567383, + "step": 16440 + }, + { + "ce_loss": 0.0737508162856102, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "distill_loss": 0.13044539093971252, + "epoch": 5.483655770513676, + "step": 16440 + }, + { + "epoch": 5.483655770513676, + "ref_ce_loss": 0.08092710375785828, + "step": 16440 + }, + { + "epoch": 5.486991327551701, + "loss": 0.3634, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "grad_norm": 1.8394968509674072, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "learning_rate": 7.116286139680208e-05, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "loss": 0.20372389256954193, + "step": 16450 + }, + { + "ce_loss": 0.034144558012485504, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "distill_loss": 0.11417104303836823, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "ref_ce_loss": 0.0553152859210968, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "loss": 0.2984168827533722, + "step": 16450 + }, + { + "ce_loss": 0.07658090442419052, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "distill_loss": 0.10359425842761993, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "ref_ce_loss": 0.07040654867887497, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "loss": 0.2048768848180771, + "step": 16450 + }, + { + "ce_loss": 0.042437851428985596, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "distill_loss": 0.07546865195035934, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "ref_ce_loss": 0.0681789219379425, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "loss": 0.3559339642524719, + "step": 16450 + }, + { + "ce_loss": 0.11100053042173386, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "distill_loss": 0.10700476169586182, + "epoch": 5.486991327551701, + "step": 16450 + }, + { + "epoch": 5.486991327551701, + "ref_ce_loss": 0.09130432456731796, + "step": 16450 + }, + { + "epoch": 5.490326884589726, + "loss": 0.3844, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "grad_norm": 1.9888533353805542, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "learning_rate": 7.099060549526406e-05, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "loss": 0.1936185657978058, + "step": 16460 + }, + { + "ce_loss": 0.007558522745966911, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "distill_loss": 0.07454708963632584, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "ref_ce_loss": 0.043821558356285095, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "loss": 0.4054626524448395, + "step": 16460 + }, + { + "ce_loss": 0.024897685274481773, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "distill_loss": 0.11481006443500519, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "ref_ce_loss": 0.12120971083641052, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "loss": 0.17700545489788055, + "step": 16460 + }, + { + "ce_loss": 0.03312915191054344, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "distill_loss": 0.09140119701623917, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "ref_ce_loss": 0.05226533114910126, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "loss": 0.21900974214076996, + "step": 16460 + }, + { + "ce_loss": 0.041102517396211624, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "distill_loss": 0.10525349527597427, + "epoch": 5.490326884589726, + "step": 16460 + }, + { + "epoch": 5.490326884589726, + "ref_ce_loss": 0.05411553755402565, + "step": 16460 + }, + { + "epoch": 5.493662441627752, + "loss": 0.3449, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "grad_norm": 2.341479539871216, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "learning_rate": 7.081849367569502e-05, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "loss": 0.24781110882759094, + "step": 16470 + }, + { + "ce_loss": 0.02872762642800808, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "distill_loss": 0.09619366377592087, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "ref_ce_loss": 0.07014994323253632, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "loss": 0.3405912220478058, + "step": 16470 + }, + { + "ce_loss": 0.04670342057943344, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "distill_loss": 0.1301906406879425, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "ref_ce_loss": 0.09883410483598709, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "loss": 0.35678622126579285, + "step": 16470 + }, + { + "ce_loss": 0.09243296831846237, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "distill_loss": 0.15158620476722717, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "ref_ce_loss": 0.08452881872653961, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "loss": 0.22663433849811554, + "step": 16470 + }, + { + "ce_loss": 0.03276556730270386, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "distill_loss": 0.08965560793876648, + "epoch": 5.493662441627752, + "step": 16470 + }, + { + "epoch": 5.493662441627752, + "ref_ce_loss": 0.0697120726108551, + "step": 16470 + }, + { + "epoch": 5.496997998665777, + "loss": 0.3554, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "grad_norm": 3.4519755840301514, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "learning_rate": 7.064652625195883e-05, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "loss": 0.32312771677970886, + "step": 16480 + }, + { + "ce_loss": 0.08518663048744202, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "distill_loss": 0.11265785247087479, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "ref_ce_loss": 0.1049276664853096, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "loss": 0.29952967166900635, + "step": 16480 + }, + { + "ce_loss": 0.07962783426046371, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "distill_loss": 0.10899212956428528, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "ref_ce_loss": 0.08677392452955246, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "loss": 0.32281216979026794, + "step": 16480 + }, + { + "ce_loss": 0.06670834124088287, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "distill_loss": 0.14671894907951355, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "ref_ce_loss": 0.0661352351307869, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "loss": 0.27929234504699707, + "step": 16480 + }, + { + "ce_loss": 0.04909088462591171, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "distill_loss": 0.14832495152950287, + "epoch": 5.496997998665777, + "step": 16480 + }, + { + "epoch": 5.496997998665777, + "ref_ce_loss": 0.056601885706186295, + "step": 16480 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.3452, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "grad_norm": 2.604300022125244, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "learning_rate": 7.047470353765648e-05, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.3394431471824646, + "step": 16490 + }, + { + "ce_loss": 0.09183358401060104, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "distill_loss": 0.1244954913854599, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "ref_ce_loss": 0.06953916698694229, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.22680865228176117, + "step": 16490 + }, + { + "ce_loss": 0.04873758554458618, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "distill_loss": 0.10830150544643402, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "ref_ce_loss": 0.03887654468417168, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.32523858547210693, + "step": 16490 + }, + { + "ce_loss": 0.12041735649108887, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "distill_loss": 0.11242323368787766, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "ref_ce_loss": 0.06937728822231293, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "loss": 0.4538167417049408, + "step": 16490 + }, + { + "ce_loss": 0.16250401735305786, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "distill_loss": 0.16419684886932373, + "epoch": 5.5003335557038024, + "step": 16490 + }, + { + "epoch": 5.5003335557038024, + "ref_ce_loss": 0.09326472133398056, + "step": 16490 + }, + { + "epoch": 5.503669112741828, + "loss": 0.3291, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "grad_norm": 2.0353572368621826, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "learning_rate": 7.030302584612467e-05, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "loss": 0.304289311170578, + "step": 16500 + }, + { + "ce_loss": 0.07965365797281265, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "distill_loss": 0.10643766075372696, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "ref_ce_loss": 0.07184035331010818, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "loss": 0.33690714836120605, + "step": 16500 + }, + { + "ce_loss": 0.06654191762208939, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "distill_loss": 0.12142252922058105, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "ref_ce_loss": 0.06164618954062462, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "loss": 0.29911255836486816, + "step": 16500 + }, + { + "ce_loss": 0.060956381261348724, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "distill_loss": 0.13254477083683014, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "ref_ce_loss": 0.05198168754577637, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "loss": 0.3498637080192566, + "step": 16500 + }, + { + "ce_loss": 0.05736403167247772, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "distill_loss": 0.15107011795043945, + "epoch": 5.503669112741828, + "step": 16500 + }, + { + "epoch": 5.503669112741828, + "ref_ce_loss": 0.06633694469928741, + "step": 16500 + }, + { + "epoch": 5.507004669779853, + "loss": 0.3628, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "grad_norm": 2.2993712425231934, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "learning_rate": 7.013149349043581e-05, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "loss": 0.3825063109397888, + "step": 16510 + }, + { + "ce_loss": 0.07082057744264603, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "distill_loss": 0.16355140507221222, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "ref_ce_loss": 0.08108976483345032, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "loss": 0.3623524010181427, + "step": 16510 + }, + { + "ce_loss": 0.15745019912719727, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "distill_loss": 0.11442229896783829, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "ref_ce_loss": 0.06455233693122864, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "loss": 0.26522186398506165, + "step": 16510 + }, + { + "ce_loss": 0.06558328866958618, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "distill_loss": 0.0987500324845314, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "ref_ce_loss": 0.0726059302687645, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "loss": 0.5976641178131104, + "step": 16510 + }, + { + "ce_loss": 0.09752736985683441, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "distill_loss": 0.13297367095947266, + "epoch": 5.507004669779853, + "step": 16510 + }, + { + "epoch": 5.507004669779853, + "ref_ce_loss": 0.07492193579673767, + "step": 16510 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.3752, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "grad_norm": 3.8198392391204834, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "learning_rate": 6.996010678339732e-05, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.43566957116127014, + "step": 16520 + }, + { + "ce_loss": 0.10428964346647263, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "distill_loss": 0.12884977459907532, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "ref_ce_loss": 0.07748550921678543, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.6612772941589355, + "step": 16520 + }, + { + "ce_loss": 0.06919779628515244, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "distill_loss": 0.1307699978351593, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "ref_ce_loss": 0.07698625326156616, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.42775776982307434, + "step": 16520 + }, + { + "ce_loss": 0.058075323700904846, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "distill_loss": 0.10789398103952408, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "ref_ce_loss": 0.07196734100580215, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "loss": 0.19031690061092377, + "step": 16520 + }, + { + "ce_loss": 0.047655943781137466, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "distill_loss": 0.10163514316082001, + "epoch": 5.5103402268178785, + "step": 16520 + }, + { + "epoch": 5.5103402268178785, + "ref_ce_loss": 0.040984734892845154, + "step": 16520 + }, + { + "epoch": 5.513675783855904, + "loss": 0.4149, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "grad_norm": 3.6521718502044678, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "learning_rate": 6.978886603755087e-05, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "loss": 0.30453452467918396, + "step": 16530 + }, + { + "ce_loss": 0.04079536721110344, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "distill_loss": 0.14041092991828918, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "ref_ce_loss": 0.06573795527219772, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "loss": 0.24023286998271942, + "step": 16530 + }, + { + "ce_loss": 0.07542020827531815, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "distill_loss": 0.0940532237291336, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "ref_ce_loss": 0.07066644728183746, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "loss": 0.34776246547698975, + "step": 16530 + }, + { + "ce_loss": 0.04864807799458504, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "distill_loss": 0.12712392210960388, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "ref_ce_loss": 0.07447850704193115, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "loss": 0.43463602662086487, + "step": 16530 + }, + { + "ce_loss": 0.10774467140436172, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "distill_loss": 0.12198501080274582, + "epoch": 5.513675783855904, + "step": 16530 + }, + { + "epoch": 5.513675783855904, + "ref_ce_loss": 0.08402875065803528, + "step": 16530 + }, + { + "epoch": 5.517011340893929, + "loss": 0.3387, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "grad_norm": 2.0748274326324463, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "learning_rate": 6.961777156517198e-05, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "loss": 0.36993128061294556, + "step": 16540 + }, + { + "ce_loss": 0.09083930402994156, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "distill_loss": 0.1188422292470932, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "ref_ce_loss": 0.06648019701242447, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "loss": 0.25275933742523193, + "step": 16540 + }, + { + "ce_loss": 0.06319792568683624, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "distill_loss": 0.09456001222133636, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "ref_ce_loss": 0.05898076295852661, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "loss": 0.5341216325759888, + "step": 16540 + }, + { + "ce_loss": 0.11002179235219955, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "distill_loss": 0.15567591786384583, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "ref_ce_loss": 0.07723324745893478, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "loss": 0.29186636209487915, + "step": 16540 + }, + { + "ce_loss": 0.029037468135356903, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "distill_loss": 0.1176878809928894, + "epoch": 5.517011340893929, + "step": 16540 + }, + { + "epoch": 5.517011340893929, + "ref_ce_loss": 0.05208834260702133, + "step": 16540 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.3831, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "grad_norm": 3.3479764461517334, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "learning_rate": 6.944682367826966e-05, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.3768744468688965, + "step": 16550 + }, + { + "ce_loss": 0.0981643795967102, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "distill_loss": 0.15606792271137238, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "ref_ce_loss": 0.07630893588066101, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.20364636182785034, + "step": 16550 + }, + { + "ce_loss": 0.029577821493148804, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "distill_loss": 0.09735309332609177, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "ref_ce_loss": 0.054588109254837036, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.2448481321334839, + "step": 16550 + }, + { + "ce_loss": 0.07320421189069748, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "distill_loss": 0.10613042116165161, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "ref_ce_loss": 0.06539609283208847, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "loss": 0.23442178964614868, + "step": 16550 + }, + { + "ce_loss": 0.048477523028850555, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "distill_loss": 0.1168837770819664, + "epoch": 5.5203468979319545, + "step": 16550 + }, + { + "epoch": 5.5203468979319545, + "ref_ce_loss": 0.05537949129939079, + "step": 16550 + }, + { + "epoch": 5.52368245496998, + "loss": 0.3322, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "grad_norm": 2.843312978744507, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "learning_rate": 6.927602268858526e-05, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "loss": 0.3520795404911041, + "step": 16560 + }, + { + "ce_loss": 0.09770800173282623, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "distill_loss": 0.13243599236011505, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "ref_ce_loss": 0.06643498688936234, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "loss": 0.3254886567592621, + "step": 16560 + }, + { + "ce_loss": 0.06492993235588074, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "distill_loss": 0.10419841855764389, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "ref_ce_loss": 0.0797518864274025, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "loss": 0.6603392362594604, + "step": 16560 + }, + { + "ce_loss": 0.05518954619765282, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "distill_loss": 0.13722261786460876, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "ref_ce_loss": 0.08445858210325241, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "loss": 0.3482056260108948, + "step": 16560 + }, + { + "ce_loss": 0.03709990531206131, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "distill_loss": 0.12940657138824463, + "epoch": 5.52368245496998, + "step": 16560 + }, + { + "epoch": 5.52368245496998, + "ref_ce_loss": 0.06875383853912354, + "step": 16560 + }, + { + "epoch": 5.527018012008005, + "loss": 0.3693, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "grad_norm": 3.7445974349975586, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "learning_rate": 6.910536890759254e-05, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "loss": 0.30111929774284363, + "step": 16570 + }, + { + "ce_loss": 0.11203796416521072, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "distill_loss": 0.1373513787984848, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "ref_ce_loss": 0.05158596858382225, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "loss": 0.2687418758869171, + "step": 16570 + }, + { + "ce_loss": 0.05072599649429321, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "distill_loss": 0.08930881321430206, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "ref_ce_loss": 0.06740231812000275, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "loss": 0.3890778720378876, + "step": 16570 + }, + { + "ce_loss": 0.12530957162380219, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "distill_loss": 0.11893895268440247, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "ref_ce_loss": 0.09552176296710968, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "loss": 0.48636484146118164, + "step": 16570 + }, + { + "ce_loss": 0.04522004351019859, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "distill_loss": 0.10927541553974152, + "epoch": 5.527018012008005, + "step": 16570 + }, + { + "epoch": 5.527018012008005, + "ref_ce_loss": 0.07342866063117981, + "step": 16570 + }, + { + "epoch": 5.530353569046031, + "loss": 0.3443, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "grad_norm": 2.7234625816345215, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "learning_rate": 6.893486264649653e-05, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "loss": 0.3358912765979767, + "step": 16580 + }, + { + "ce_loss": 0.046590499579906464, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "distill_loss": 0.09500577300786972, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "ref_ce_loss": 0.06674519181251526, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "loss": 0.25684136152267456, + "step": 16580 + }, + { + "ce_loss": 0.04049672558903694, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "distill_loss": 0.10132599622011185, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "ref_ce_loss": 0.09812045097351074, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "loss": 0.47450417280197144, + "step": 16580 + }, + { + "ce_loss": 0.0634898990392685, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "distill_loss": 0.10582137107849121, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "ref_ce_loss": 0.13480916619300842, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "loss": 0.4640018939971924, + "step": 16580 + }, + { + "ce_loss": 0.11096587777137756, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "distill_loss": 0.1723109483718872, + "epoch": 5.530353569046031, + "step": 16580 + }, + { + "epoch": 5.530353569046031, + "ref_ce_loss": 0.07591967284679413, + "step": 16580 + }, + { + "epoch": 5.533689126084056, + "loss": 0.4116, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "grad_norm": 3.5316085815429688, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "learning_rate": 6.876450421623346e-05, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "loss": 0.1337299346923828, + "step": 16590 + }, + { + "ce_loss": 0.007529797963798046, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "distill_loss": 0.08628208190202713, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "ref_ce_loss": 0.03970135375857353, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "loss": 0.27693334221839905, + "step": 16590 + }, + { + "ce_loss": 0.04799002781510353, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "distill_loss": 0.1492634117603302, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "ref_ce_loss": 0.0795946940779686, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "loss": 0.2540445327758789, + "step": 16590 + }, + { + "ce_loss": 0.03331456705927849, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "distill_loss": 0.10557867586612701, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "ref_ce_loss": 0.05052439495921135, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "loss": 0.28934377431869507, + "step": 16590 + }, + { + "ce_loss": 0.07273193448781967, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "distill_loss": 0.13207106292247772, + "epoch": 5.533689126084056, + "step": 16590 + }, + { + "epoch": 5.533689126084056, + "ref_ce_loss": 0.08445105701684952, + "step": 16590 + }, + { + "epoch": 5.537024683122081, + "loss": 0.3145, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "grad_norm": 2.272958993911743, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "learning_rate": 6.859429392746993e-05, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "loss": 0.30334970355033875, + "step": 16600 + }, + { + "ce_loss": 0.07540854066610336, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "distill_loss": 0.11994440108537674, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "ref_ce_loss": 0.07918747514486313, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "loss": 0.38659754395484924, + "step": 16600 + }, + { + "ce_loss": 0.13602618873119354, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "distill_loss": 0.17836399376392365, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "ref_ce_loss": 0.07202507555484772, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "loss": 0.6665504574775696, + "step": 16600 + }, + { + "ce_loss": 0.0435519814491272, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "distill_loss": 0.1457093060016632, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "ref_ce_loss": 0.07195711135864258, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "loss": 0.22605876624584198, + "step": 16600 + }, + { + "ce_loss": 0.04164009541273117, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "distill_loss": 0.08552563190460205, + "epoch": 5.537024683122081, + "step": 16600 + }, + { + "epoch": 5.537024683122081, + "ref_ce_loss": 0.07331196218729019, + "step": 16600 + }, + { + "epoch": 5.540360240160107, + "loss": 0.3445, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "grad_norm": 2.327739715576172, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "learning_rate": 6.842423209060233e-05, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "loss": 0.4476955533027649, + "step": 16610 + }, + { + "ce_loss": 0.0371936596930027, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "distill_loss": 0.10962027311325073, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "ref_ce_loss": 0.053445056080818176, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "loss": 0.3170121908187866, + "step": 16610 + }, + { + "ce_loss": 0.03845048323273659, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "distill_loss": 0.13532476127147675, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "ref_ce_loss": 0.07792802900075912, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "loss": 0.26157912611961365, + "step": 16610 + }, + { + "ce_loss": 0.05625046417117119, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "distill_loss": 0.13596224784851074, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "ref_ce_loss": 0.0490339994430542, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "loss": 0.24408766627311707, + "step": 16610 + }, + { + "ce_loss": 0.03240533545613289, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "distill_loss": 0.09369600564241409, + "epoch": 5.540360240160107, + "step": 16610 + }, + { + "epoch": 5.540360240160107, + "ref_ce_loss": 0.0623743012547493, + "step": 16610 + }, + { + "epoch": 5.543695797198132, + "loss": 0.3579, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "grad_norm": 1.9174339771270752, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "learning_rate": 6.825431901575645e-05, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "loss": 0.6768718957901001, + "step": 16620 + }, + { + "ce_loss": 0.0892643928527832, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "distill_loss": 0.12472718209028244, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "ref_ce_loss": 0.08057060092687607, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "loss": 0.7044631838798523, + "step": 16620 + }, + { + "ce_loss": 0.06510592997074127, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "distill_loss": 0.1234467476606369, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "ref_ce_loss": 0.07243905961513519, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "loss": 0.3674314022064209, + "step": 16620 + }, + { + "ce_loss": 0.09846795350313187, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "distill_loss": 0.1294548213481903, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "ref_ce_loss": 0.10479249805212021, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "loss": 0.2651240825653076, + "step": 16620 + }, + { + "ce_loss": 0.06598225235939026, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "distill_loss": 0.1323862224817276, + "epoch": 5.543695797198132, + "step": 16620 + }, + { + "epoch": 5.543695797198132, + "ref_ce_loss": 0.06659011542797089, + "step": 16620 + }, + { + "epoch": 5.547031354236157, + "loss": 0.3521, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "grad_norm": 3.350158214569092, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "learning_rate": 6.808455501278659e-05, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "loss": 0.31689751148223877, + "step": 16630 + }, + { + "ce_loss": 0.07073936611413956, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "distill_loss": 0.13998320698738098, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "ref_ce_loss": 0.08537984639406204, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "loss": 0.26067429780960083, + "step": 16630 + }, + { + "ce_loss": 0.0695922002196312, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "distill_loss": 0.10241564363241196, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "ref_ce_loss": 0.06054640933871269, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "loss": 0.3345610499382019, + "step": 16630 + }, + { + "ce_loss": 0.06958122551441193, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "distill_loss": 0.100101038813591, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "ref_ce_loss": 0.08606253564357758, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "loss": 0.39710739254951477, + "step": 16630 + }, + { + "ce_loss": 0.08828610181808472, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "distill_loss": 0.1490803062915802, + "epoch": 5.547031354236157, + "step": 16630 + }, + { + "epoch": 5.547031354236157, + "ref_ce_loss": 0.0988338366150856, + "step": 16630 + }, + { + "epoch": 5.550366911274183, + "loss": 0.3204, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "grad_norm": 2.270860195159912, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "learning_rate": 6.791494039127539e-05, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "loss": 0.563323438167572, + "step": 16640 + }, + { + "ce_loss": 0.10518639534711838, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "distill_loss": 0.10006602853536606, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "ref_ce_loss": 0.06689028441905975, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "loss": 0.40847182273864746, + "step": 16640 + }, + { + "ce_loss": 0.06803660839796066, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "distill_loss": 0.15362216532230377, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "ref_ce_loss": 0.08404377102851868, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "loss": 0.4165043830871582, + "step": 16640 + }, + { + "ce_loss": 0.10225572437047958, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "distill_loss": 0.15723662078380585, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "ref_ce_loss": 0.08742760866880417, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "loss": 0.22630423307418823, + "step": 16640 + }, + { + "ce_loss": 0.0508527010679245, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "distill_loss": 0.09829200059175491, + "epoch": 5.550366911274183, + "step": 16640 + }, + { + "epoch": 5.550366911274183, + "ref_ce_loss": 0.05572168529033661, + "step": 16640 + }, + { + "epoch": 5.553702468312208, + "loss": 0.3596, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "grad_norm": 4.383035659790039, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "learning_rate": 6.774547546053297e-05, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "loss": 0.24793292582035065, + "step": 16650 + }, + { + "ce_loss": 0.04041785001754761, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "distill_loss": 0.1391821801662445, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "ref_ce_loss": 0.06805215030908585, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "loss": 0.30265313386917114, + "step": 16650 + }, + { + "ce_loss": 0.0523945614695549, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "distill_loss": 0.11704914271831512, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "ref_ce_loss": 0.06724908947944641, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "loss": 0.2269441783428192, + "step": 16650 + }, + { + "ce_loss": 0.029113974422216415, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "distill_loss": 0.11111169308423996, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "ref_ce_loss": 0.08629629015922546, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "loss": 0.2849983870983124, + "step": 16650 + }, + { + "ce_loss": 0.05142313987016678, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "distill_loss": 0.1463957130908966, + "epoch": 5.553702468312208, + "step": 16650 + }, + { + "epoch": 5.553702468312208, + "ref_ce_loss": 0.0440007783472538, + "step": 16650 + }, + { + "epoch": 5.557038025350233, + "loss": 0.3277, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "grad_norm": 2.9877946376800537, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "learning_rate": 6.757616052959658e-05, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "loss": 0.21352145075798035, + "step": 16660 + }, + { + "ce_loss": 0.03965906798839569, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "distill_loss": 0.09861264377832413, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "ref_ce_loss": 0.04607783630490303, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "loss": 0.38374829292297363, + "step": 16660 + }, + { + "ce_loss": 0.10164055228233337, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "distill_loss": 0.11329170316457748, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "ref_ce_loss": 0.0692281424999237, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "loss": 0.8557072877883911, + "step": 16660 + }, + { + "ce_loss": 0.046634022146463394, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "distill_loss": 0.1687537282705307, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "ref_ce_loss": 0.06543728709220886, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "loss": 0.19127684831619263, + "step": 16660 + }, + { + "ce_loss": 0.0667652115225792, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "distill_loss": 0.095591239631176, + "epoch": 5.557038025350233, + "step": 16660 + }, + { + "epoch": 5.557038025350233, + "ref_ce_loss": 0.028839105740189552, + "step": 16660 + }, + { + "epoch": 5.560373582388259, + "loss": 0.3787, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "grad_norm": 3.7834699153900146, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "learning_rate": 6.740699590722982e-05, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "loss": 0.4098147749900818, + "step": 16670 + }, + { + "ce_loss": 0.06272386014461517, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "distill_loss": 0.1474025994539261, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "ref_ce_loss": 0.10220678150653839, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "loss": 0.31686991453170776, + "step": 16670 + }, + { + "ce_loss": 0.1106472909450531, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "distill_loss": 0.1048339232802391, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "ref_ce_loss": 0.08176649361848831, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "loss": 0.4467318058013916, + "step": 16670 + }, + { + "ce_loss": 0.09130599349737167, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "distill_loss": 0.22026273608207703, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "ref_ce_loss": 0.0889739841222763, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "loss": 0.25014933943748474, + "step": 16670 + }, + { + "ce_loss": 0.06914155930280685, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "distill_loss": 0.12035161256790161, + "epoch": 5.560373582388259, + "step": 16670 + }, + { + "epoch": 5.560373582388259, + "ref_ce_loss": 0.06039172783493996, + "step": 16670 + }, + { + "epoch": 5.563709139426284, + "loss": 0.3791, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "grad_norm": 3.1715545654296875, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "learning_rate": 6.72379819019223e-05, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "loss": 1.6280040740966797, + "step": 16680 + }, + { + "ce_loss": 0.1090499684214592, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "distill_loss": 0.14834822714328766, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "ref_ce_loss": 0.10466254502534866, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "loss": 0.24755717813968658, + "step": 16680 + }, + { + "ce_loss": 0.07180020958185196, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "distill_loss": 0.09066848456859589, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "ref_ce_loss": 0.06948643922805786, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "loss": 0.16598784923553467, + "step": 16680 + }, + { + "ce_loss": 0.006515104323625565, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "distill_loss": 0.07459402829408646, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "ref_ce_loss": 0.0633934810757637, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "loss": 0.34224605560302734, + "step": 16680 + }, + { + "ce_loss": 0.058937061578035355, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "distill_loss": 0.12465833127498627, + "epoch": 5.563709139426284, + "step": 16680 + }, + { + "epoch": 5.563709139426284, + "ref_ce_loss": 0.06454360485076904, + "step": 16680 + }, + { + "epoch": 5.567044696464309, + "loss": 0.403, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "grad_norm": 4.215590953826904, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "learning_rate": 6.706911882188879e-05, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "loss": 0.33820870518684387, + "step": 16690 + }, + { + "ce_loss": 0.08671735227108002, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "distill_loss": 0.12246132642030716, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "ref_ce_loss": 0.0622439831495285, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "loss": 0.40078848600387573, + "step": 16690 + }, + { + "ce_loss": 0.08150546252727509, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "distill_loss": 0.21842795610427856, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "ref_ce_loss": 0.08130494505167007, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "loss": 0.4324227273464203, + "step": 16690 + }, + { + "ce_loss": 0.11700266599655151, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "distill_loss": 0.13477078080177307, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "ref_ce_loss": 0.10167690366506577, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "loss": 0.2886402904987335, + "step": 16690 + }, + { + "ce_loss": 0.054880108684301376, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "distill_loss": 0.10805704444646835, + "epoch": 5.567044696464309, + "step": 16690 + }, + { + "epoch": 5.567044696464309, + "ref_ce_loss": 0.08718326687812805, + "step": 16690 + }, + { + "epoch": 5.570380253502335, + "loss": 0.3885, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "grad_norm": 3.0666604042053223, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "learning_rate": 6.690040697506896e-05, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "loss": 0.32792338728904724, + "step": 16700 + }, + { + "ce_loss": 0.07738234847784042, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "distill_loss": 0.11659729480743408, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "ref_ce_loss": 0.06254538148641586, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "loss": 0.3328701853752136, + "step": 16700 + }, + { + "ce_loss": 0.11399929225444794, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "distill_loss": 0.10265770554542542, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "ref_ce_loss": 0.09002488851547241, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "loss": 0.3892509341239929, + "step": 16700 + }, + { + "ce_loss": 0.06272214651107788, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "distill_loss": 0.12995900213718414, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "ref_ce_loss": 0.07331440597772598, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "loss": 0.6979650259017944, + "step": 16700 + }, + { + "ce_loss": 0.06049128994345665, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "distill_loss": 0.37069612741470337, + "epoch": 5.570380253502335, + "step": 16700 + }, + { + "epoch": 5.570380253502335, + "ref_ce_loss": 0.11396878212690353, + "step": 16700 + }, + { + "epoch": 5.57371581054036, + "loss": 0.3821, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "grad_norm": 2.9326987266540527, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "learning_rate": 6.673184666912672e-05, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "loss": 0.4939331114292145, + "step": 16710 + }, + { + "ce_loss": 0.11444830149412155, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "distill_loss": 0.21305085718631744, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "ref_ce_loss": 0.12908244132995605, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "loss": 0.26652368903160095, + "step": 16710 + }, + { + "ce_loss": 0.07186118513345718, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "distill_loss": 0.11902876943349838, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "ref_ce_loss": 0.04031604155898094, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "loss": 0.43540751934051514, + "step": 16710 + }, + { + "ce_loss": 0.09456142783164978, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "distill_loss": 0.1499258279800415, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "ref_ce_loss": 0.09525250643491745, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "loss": 0.19972451031208038, + "step": 16710 + }, + { + "ce_loss": 0.02131311222910881, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "distill_loss": 0.09276192635297775, + "epoch": 5.57371581054036, + "step": 16710 + }, + { + "epoch": 5.57371581054036, + "ref_ce_loss": 0.05993827059864998, + "step": 16710 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.3731, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "grad_norm": 2.389400005340576, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "learning_rate": 6.656343821144956e-05, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.38022905588150024, + "step": 16720 + }, + { + "ce_loss": 0.10960319638252258, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "distill_loss": 0.22096657752990723, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "ref_ce_loss": 0.033225253224372864, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.35766640305519104, + "step": 16720 + }, + { + "ce_loss": 0.09015137702226639, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "distill_loss": 0.1479002833366394, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "ref_ce_loss": 0.09554192423820496, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.3077741265296936, + "step": 16720 + }, + { + "ce_loss": 0.06773196160793304, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "distill_loss": 0.11214911192655563, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "ref_ce_loss": 0.07687600702047348, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "loss": 0.5804701447486877, + "step": 16720 + }, + { + "ce_loss": 0.07880949229001999, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "distill_loss": 0.2357291579246521, + "epoch": 5.5770513675783855, + "step": 16720 + }, + { + "epoch": 5.5770513675783855, + "ref_ce_loss": 0.08584926277399063, + "step": 16720 + }, + { + "epoch": 5.580386924616411, + "loss": 0.3451, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "grad_norm": 1.8142964839935303, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "learning_rate": 6.639518190914808e-05, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "loss": 0.3560985326766968, + "step": 16730 + }, + { + "ce_loss": 0.06115071102976799, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "distill_loss": 0.14061114192008972, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "ref_ce_loss": 0.1131257712841034, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "loss": 0.2548341155052185, + "step": 16730 + }, + { + "ce_loss": 0.08115708082914352, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "distill_loss": 0.09377031028270721, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "ref_ce_loss": 0.07978391647338867, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "loss": 0.2637752294540405, + "step": 16730 + }, + { + "ce_loss": 0.04890935868024826, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "distill_loss": 0.12748581171035767, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "ref_ce_loss": 0.06210982799530029, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "loss": 0.308973103761673, + "step": 16730 + }, + { + "ce_loss": 0.06964000314474106, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "distill_loss": 0.09779088199138641, + "epoch": 5.580386924616411, + "step": 16730 + }, + { + "epoch": 5.580386924616411, + "ref_ce_loss": 0.0733722448348999, + "step": 16730 + }, + { + "epoch": 5.583722481654436, + "loss": 0.3738, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "grad_norm": 5.687265396118164, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "learning_rate": 6.622707806905536e-05, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "loss": 0.502987802028656, + "step": 16740 + }, + { + "ce_loss": 0.1237458735704422, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "distill_loss": 0.16146957874298096, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "ref_ce_loss": 0.08447730541229248, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "loss": 0.22744502127170563, + "step": 16740 + }, + { + "ce_loss": 0.05142305791378021, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "distill_loss": 0.10567010194063187, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "ref_ce_loss": 0.051125843077898026, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "loss": 0.2630317211151123, + "step": 16740 + }, + { + "ce_loss": 0.017098814249038696, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "distill_loss": 0.16734573245048523, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "ref_ce_loss": 0.05436302348971367, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "loss": 0.22591489553451538, + "step": 16740 + }, + { + "ce_loss": 0.02939753420650959, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "distill_loss": 0.08773542940616608, + "epoch": 5.583722481654436, + "step": 16740 + }, + { + "epoch": 5.583722481654436, + "ref_ce_loss": 0.047614991664886475, + "step": 16740 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.3427, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "grad_norm": 3.073615074157715, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "learning_rate": 6.605912699772657e-05, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.49910616874694824, + "step": 16750 + }, + { + "ce_loss": 0.11704373359680176, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "distill_loss": 0.16967318952083588, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "ref_ce_loss": 0.08763284236192703, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.39135798811912537, + "step": 16750 + }, + { + "ce_loss": 0.12653915584087372, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "distill_loss": 0.1373661309480667, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "ref_ce_loss": 0.11389608681201935, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.22781170904636383, + "step": 16750 + }, + { + "ce_loss": 0.044658780097961426, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "distill_loss": 0.09752068668603897, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "ref_ce_loss": 0.07326694577932358, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "loss": 0.3132156729698181, + "step": 16750 + }, + { + "ce_loss": 0.0768897607922554, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "distill_loss": 0.15624229609966278, + "epoch": 5.5870580386924615, + "step": 16750 + }, + { + "epoch": 5.5870580386924615, + "ref_ce_loss": 0.06274968385696411, + "step": 16750 + }, + { + "epoch": 5.590393595730487, + "loss": 0.4097, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "grad_norm": 2.408324956893921, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "learning_rate": 6.589132900143807e-05, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "loss": 0.26510536670684814, + "step": 16760 + }, + { + "ce_loss": 0.0586848221719265, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "distill_loss": 0.09962846338748932, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "ref_ce_loss": 0.07298450917005539, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "loss": 0.4885880947113037, + "step": 16760 + }, + { + "ce_loss": 0.1015976145863533, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "distill_loss": 0.21036067605018616, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "ref_ce_loss": 0.058434370905160904, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "loss": 0.2753802537918091, + "step": 16760 + }, + { + "ce_loss": 0.05114470794796944, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "distill_loss": 0.12594880163669586, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "ref_ce_loss": 0.061987679451704025, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "loss": 0.17079053819179535, + "step": 16760 + }, + { + "ce_loss": 0.004621799103915691, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "distill_loss": 0.09937524795532227, + "epoch": 5.590393595730487, + "step": 16760 + }, + { + "epoch": 5.590393595730487, + "ref_ce_loss": 0.046160973608493805, + "step": 16760 + }, + { + "epoch": 5.593729152768512, + "loss": 0.3805, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "grad_norm": 2.079479694366455, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "learning_rate": 6.572368438618734e-05, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "loss": 0.5323101878166199, + "step": 16770 + }, + { + "ce_loss": 0.12786847352981567, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "distill_loss": 0.16538603603839874, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "ref_ce_loss": 0.09879563003778458, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "loss": 0.5461047291755676, + "step": 16770 + }, + { + "ce_loss": 0.12018280476331711, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "distill_loss": 0.13303323090076447, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "ref_ce_loss": 0.08113959431648254, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "loss": 0.26084309816360474, + "step": 16770 + }, + { + "ce_loss": 0.04693206027150154, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "distill_loss": 0.12489647418260574, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "ref_ce_loss": 0.0498616062104702, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "loss": 0.6751585602760315, + "step": 16770 + }, + { + "ce_loss": 0.09317155182361603, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "distill_loss": 0.13803917169570923, + "epoch": 5.593729152768512, + "step": 16770 + }, + { + "epoch": 5.593729152768512, + "ref_ce_loss": 0.10896999388933182, + "step": 16770 + }, + { + "epoch": 5.597064709806538, + "loss": 0.3811, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "grad_norm": 3.617305278778076, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "learning_rate": 6.555619345769205e-05, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "loss": 0.27140775322914124, + "step": 16780 + }, + { + "ce_loss": 0.09383071959018707, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "distill_loss": 0.08340008556842804, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "ref_ce_loss": 0.07747329771518707, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "loss": 0.31209468841552734, + "step": 16780 + }, + { + "ce_loss": 0.04330824688076973, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "distill_loss": 0.13244518637657166, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "ref_ce_loss": 0.0734424814581871, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "loss": 0.451698362827301, + "step": 16780 + }, + { + "ce_loss": 0.024857228621840477, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "distill_loss": 0.16980913281440735, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "ref_ce_loss": 0.08439881354570389, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "loss": 0.3579801321029663, + "step": 16780 + }, + { + "ce_loss": 0.09500003606081009, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "distill_loss": 0.1582847535610199, + "epoch": 5.597064709806538, + "step": 16780 + }, + { + "epoch": 5.597064709806538, + "ref_ce_loss": 0.08331548422574997, + "step": 16780 + }, + { + "epoch": 5.600400266844563, + "loss": 0.374, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "grad_norm": 2.767927646636963, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "learning_rate": 6.53888565213895e-05, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "loss": 0.5215246677398682, + "step": 16790 + }, + { + "ce_loss": 0.09143102169036865, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "distill_loss": 0.16542060673236847, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "ref_ce_loss": 0.07400554418563843, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "loss": 0.38477057218551636, + "step": 16790 + }, + { + "ce_loss": 0.07713790237903595, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "distill_loss": 0.1572268009185791, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "ref_ce_loss": 0.07512886077165604, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "loss": 0.8082589507102966, + "step": 16790 + }, + { + "ce_loss": 0.10924425721168518, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "distill_loss": 0.21931283175945282, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "ref_ce_loss": 0.08828362822532654, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "loss": 0.48302650451660156, + "step": 16790 + }, + { + "ce_loss": 0.08925081789493561, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "distill_loss": 0.14999093115329742, + "epoch": 5.600400266844563, + "step": 16790 + }, + { + "epoch": 5.600400266844563, + "ref_ce_loss": 0.12079279124736786, + "step": 16790 + }, + { + "epoch": 5.603735823882588, + "loss": 0.3433, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "grad_norm": 2.1402974128723145, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "learning_rate": 6.522167388243632e-05, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "loss": 0.22286638617515564, + "step": 16800 + }, + { + "ce_loss": 0.005005288403481245, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "distill_loss": 0.12163923680782318, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "ref_ce_loss": 0.06334011256694794, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "loss": 0.4707980751991272, + "step": 16800 + }, + { + "ce_loss": 0.08313055336475372, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "distill_loss": 0.12003469467163086, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "ref_ce_loss": 0.07313154637813568, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "loss": 0.2341451197862625, + "step": 16800 + }, + { + "ce_loss": 0.0286604855209589, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "distill_loss": 0.14671389758586884, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "ref_ce_loss": 0.0586368553340435, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "loss": 0.2169308066368103, + "step": 16800 + }, + { + "ce_loss": 0.055694498121738434, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "distill_loss": 0.09109349548816681, + "epoch": 5.603735823882588, + "step": 16800 + }, + { + "epoch": 5.603735823882588, + "ref_ce_loss": 0.05517904832959175, + "step": 16800 + }, + { + "epoch": 5.607071380920614, + "loss": 0.3733, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "grad_norm": 4.999337673187256, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "learning_rate": 6.50546458457076e-05, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "loss": 0.3535381257534027, + "step": 16810 + }, + { + "ce_loss": 0.08812938630580902, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "distill_loss": 0.11784398555755615, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "ref_ce_loss": 0.08096872270107269, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "loss": 0.45187708735466003, + "step": 16810 + }, + { + "ce_loss": 0.029312031343579292, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "distill_loss": 0.10933209210634232, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "ref_ce_loss": 0.08249694108963013, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "loss": 0.32924607396125793, + "step": 16810 + }, + { + "ce_loss": 0.06451299041509628, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "distill_loss": 0.1427079141139984, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "ref_ce_loss": 0.08778007328510284, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "loss": 0.42343294620513916, + "step": 16810 + }, + { + "ce_loss": 0.14571364223957062, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "distill_loss": 0.2002367526292801, + "epoch": 5.607071380920614, + "step": 16810 + }, + { + "epoch": 5.607071380920614, + "ref_ce_loss": 0.07713088393211365, + "step": 16810 + }, + { + "epoch": 5.610406937958639, + "loss": 0.3416, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "grad_norm": 1.9727840423583984, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "learning_rate": 6.48877727157966e-05, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "loss": 0.19755208492279053, + "step": 16820 + }, + { + "ce_loss": 0.021219512447714806, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "distill_loss": 0.11275633424520493, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "ref_ce_loss": 0.06317698955535889, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "loss": 0.5142295360565186, + "step": 16820 + }, + { + "ce_loss": 0.06793922185897827, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "distill_loss": 0.2093472182750702, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "ref_ce_loss": 0.0721978098154068, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "loss": 0.45053979754447937, + "step": 16820 + }, + { + "ce_loss": 0.07777457684278488, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "distill_loss": 0.1466236710548401, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "ref_ce_loss": 0.08509445935487747, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "loss": 0.2358899563550949, + "step": 16820 + }, + { + "ce_loss": 0.04887387901544571, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "distill_loss": 0.12546847760677338, + "epoch": 5.610406937958639, + "step": 16820 + }, + { + "epoch": 5.610406937958639, + "ref_ce_loss": 0.06151670962572098, + "step": 16820 + }, + { + "epoch": 5.613742494996664, + "loss": 0.3501, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "grad_norm": 2.5225648880004883, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "learning_rate": 6.472105479701425e-05, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "loss": 0.31352949142456055, + "step": 16830 + }, + { + "ce_loss": 0.10307907313108444, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "distill_loss": 0.13297227025032043, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "ref_ce_loss": 0.07737308740615845, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "loss": 0.2373771369457245, + "step": 16830 + }, + { + "ce_loss": 0.03195342794060707, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "distill_loss": 0.12204240262508392, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "ref_ce_loss": 0.0517796166241169, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "loss": 0.38204991817474365, + "step": 16830 + }, + { + "ce_loss": 0.05168701708316803, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "distill_loss": 0.18729457259178162, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "ref_ce_loss": 0.0903494581580162, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "loss": 0.5538869500160217, + "step": 16830 + }, + { + "ce_loss": 0.10379138588905334, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "distill_loss": 0.24181246757507324, + "epoch": 5.613742494996664, + "step": 16830 + }, + { + "epoch": 5.613742494996664, + "ref_ce_loss": 0.05665380507707596, + "step": 16830 + }, + { + "epoch": 5.61707805203469, + "loss": 0.3983, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "grad_norm": 6.060856819152832, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "learning_rate": 6.455449239338809e-05, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "loss": 0.6989403963088989, + "step": 16840 + }, + { + "ce_loss": 0.10969628393650055, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "distill_loss": 0.29851293563842773, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "ref_ce_loss": 0.06750264763832092, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "loss": 0.31590399146080017, + "step": 16840 + }, + { + "ce_loss": 0.12308245897293091, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "distill_loss": 0.11415868997573853, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "ref_ce_loss": 0.05798859894275665, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "loss": 0.27054455876350403, + "step": 16840 + }, + { + "ce_loss": 0.050939805805683136, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "distill_loss": 0.16063706576824188, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "ref_ce_loss": 0.05888758972287178, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "loss": 0.2523441016674042, + "step": 16840 + }, + { + "ce_loss": 0.038503505289554596, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "distill_loss": 0.1113259419798851, + "epoch": 5.61707805203469, + "step": 16840 + }, + { + "epoch": 5.61707805203469, + "ref_ce_loss": 0.08058229088783264, + "step": 16840 + }, + { + "epoch": 5.620413609072715, + "loss": 0.3615, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "grad_norm": 3.9339704513549805, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "learning_rate": 6.43880858086623e-05, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "loss": 0.31341302394866943, + "step": 16850 + }, + { + "ce_loss": 0.06522848457098007, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "distill_loss": 0.10614801943302155, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "ref_ce_loss": 0.07481067627668381, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "loss": 0.37363332509994507, + "step": 16850 + }, + { + "ce_loss": 0.0904751867055893, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "distill_loss": 0.20257927477359772, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "ref_ce_loss": 0.0803053081035614, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "loss": 0.9973900318145752, + "step": 16850 + }, + { + "ce_loss": 0.1294299215078354, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "distill_loss": 0.2549104690551758, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "ref_ce_loss": 0.1427856683731079, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "loss": 0.27538955211639404, + "step": 16850 + }, + { + "ce_loss": 0.03127444162964821, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "distill_loss": 0.11600720882415771, + "epoch": 5.620413609072715, + "step": 16850 + }, + { + "epoch": 5.620413609072715, + "ref_ce_loss": 0.053942952305078506, + "step": 16850 + }, + { + "epoch": 5.62374916611074, + "loss": 0.376, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "grad_norm": 1.79304039478302, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "learning_rate": 6.42218353462969e-05, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "loss": 0.5045366287231445, + "step": 16860 + }, + { + "ce_loss": 0.07572513073682785, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "distill_loss": 0.14832082390785217, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "ref_ce_loss": 0.0814862921833992, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "loss": 0.3642059564590454, + "step": 16860 + }, + { + "ce_loss": 0.05447271838784218, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "distill_loss": 0.15132412314414978, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "ref_ce_loss": 0.07785908877849579, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "loss": 0.1737252026796341, + "step": 16860 + }, + { + "ce_loss": 0.006255322601646185, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "distill_loss": 0.07501223683357239, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "ref_ce_loss": 0.05216903239488602, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "loss": 0.4993632137775421, + "step": 16860 + }, + { + "ce_loss": 0.09058297425508499, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "distill_loss": 0.10913488268852234, + "epoch": 5.62374916611074, + "step": 16860 + }, + { + "epoch": 5.62374916611074, + "ref_ce_loss": 0.07340264320373535, + "step": 16860 + }, + { + "epoch": 5.627084723148766, + "loss": 0.3919, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "grad_norm": 1.8264517784118652, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "learning_rate": 6.405574130946707e-05, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "loss": 0.30188092589378357, + "step": 16870 + }, + { + "ce_loss": 0.08277290314435959, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "distill_loss": 0.17446166276931763, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "ref_ce_loss": 0.04445138946175575, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "loss": 0.32595294713974, + "step": 16870 + }, + { + "ce_loss": 0.07430987805128098, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "distill_loss": 0.15796136856079102, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "ref_ce_loss": 0.07466117292642593, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "loss": 0.3349049687385559, + "step": 16870 + }, + { + "ce_loss": 0.0747389942407608, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "distill_loss": 0.1390739381313324, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "ref_ce_loss": 0.12086381763219833, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "loss": 0.3198974132537842, + "step": 16870 + }, + { + "ce_loss": 0.0835222452878952, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "distill_loss": 0.14694853127002716, + "epoch": 5.627084723148766, + "step": 16870 + }, + { + "epoch": 5.627084723148766, + "ref_ce_loss": 0.06452935934066772, + "step": 16870 + }, + { + "epoch": 5.630420280186791, + "loss": 0.3441, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "grad_norm": 5.504924774169922, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "learning_rate": 6.388980400106283e-05, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "loss": 0.4352351129055023, + "step": 16880 + }, + { + "ce_loss": 0.07789971679449081, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "distill_loss": 0.13848719000816345, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "ref_ce_loss": 0.12263132631778717, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "loss": 0.43858495354652405, + "step": 16880 + }, + { + "ce_loss": 0.11291355639696121, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "distill_loss": 0.14818082749843597, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "ref_ce_loss": 0.06953940540552139, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "loss": 0.2217913717031479, + "step": 16880 + }, + { + "ce_loss": 0.040353529155254364, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "distill_loss": 0.12617231905460358, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "ref_ce_loss": 0.05498276278376579, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "loss": 0.28420495986938477, + "step": 16880 + }, + { + "ce_loss": 0.06727323681116104, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "distill_loss": 0.11587470769882202, + "epoch": 5.630420280186791, + "step": 16880 + }, + { + "epoch": 5.630420280186791, + "ref_ce_loss": 0.05712687596678734, + "step": 16880 + }, + { + "epoch": 5.633755837224816, + "loss": 0.3626, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "grad_norm": 3.8169634342193604, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "learning_rate": 6.37240237236884e-05, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "loss": 0.34174394607543945, + "step": 16890 + }, + { + "ce_loss": 0.06275150179862976, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "distill_loss": 0.15911883115768433, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "ref_ce_loss": 0.08189556747674942, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "loss": 0.2643011510372162, + "step": 16890 + }, + { + "ce_loss": 0.06280254572629929, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "distill_loss": 0.11299487203359604, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "ref_ce_loss": 0.05963292345404625, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "loss": 0.3587222397327423, + "step": 16890 + }, + { + "ce_loss": 0.11320937424898148, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "distill_loss": 0.16261257231235504, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "ref_ce_loss": 0.08281068503856659, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "loss": 0.23671752214431763, + "step": 16890 + }, + { + "ce_loss": 0.03498915210366249, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "distill_loss": 0.10112226009368896, + "epoch": 5.633755837224816, + "step": 16890 + }, + { + "epoch": 5.633755837224816, + "ref_ce_loss": 0.06465116888284683, + "step": 16890 + }, + { + "epoch": 5.637091394262842, + "loss": 0.3735, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "grad_norm": 2.1169912815093994, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "learning_rate": 6.355840077966158e-05, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "loss": 0.23858742415905, + "step": 16900 + }, + { + "ce_loss": 0.041046276688575745, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "distill_loss": 0.07950280606746674, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "ref_ce_loss": 0.072368323802948, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "loss": 0.28376126289367676, + "step": 16900 + }, + { + "ce_loss": 0.02801305614411831, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "distill_loss": 0.12259427458047867, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "ref_ce_loss": 0.07551668584346771, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "loss": 0.18908122181892395, + "step": 16900 + }, + { + "ce_loss": 0.032690152525901794, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "distill_loss": 0.0932382196187973, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "ref_ce_loss": 0.06297314912080765, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "loss": 0.3903302252292633, + "step": 16900 + }, + { + "ce_loss": 0.09799889475107193, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "distill_loss": 0.1463698148727417, + "epoch": 5.637091394262842, + "step": 16900 + }, + { + "epoch": 5.637091394262842, + "ref_ce_loss": 0.07008067518472672, + "step": 16900 + }, + { + "epoch": 5.640426951300867, + "loss": 0.389, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "grad_norm": 3.601564884185791, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "learning_rate": 6.339293547101339e-05, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "loss": 0.6289790272712708, + "step": 16910 + }, + { + "ce_loss": 0.05350485444068909, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "distill_loss": 0.3808191120624542, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "ref_ce_loss": 0.09556426852941513, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "loss": 0.3073519468307495, + "step": 16910 + }, + { + "ce_loss": 0.012231869623064995, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "distill_loss": 0.10754086822271347, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "ref_ce_loss": 0.07653357833623886, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "loss": 0.4514399766921997, + "step": 16910 + }, + { + "ce_loss": 0.10036049783229828, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "distill_loss": 0.14256389439105988, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "ref_ce_loss": 0.09835103154182434, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "loss": 0.3855137228965759, + "step": 16910 + }, + { + "ce_loss": 0.12630429863929749, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "distill_loss": 0.15595805644989014, + "epoch": 5.640426951300867, + "step": 16910 + }, + { + "epoch": 5.640426951300867, + "ref_ce_loss": 0.0788608267903328, + "step": 16910 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.3837, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "grad_norm": 1.9282772541046143, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "learning_rate": 6.322762809948714e-05, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.3533139228820801, + "step": 16920 + }, + { + "ce_loss": 0.12137754261493683, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "distill_loss": 0.14143449068069458, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "ref_ce_loss": 0.07143258303403854, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.31891804933547974, + "step": 16920 + }, + { + "ce_loss": 0.07180596888065338, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "distill_loss": 0.18838739395141602, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "ref_ce_loss": 0.05863086134195328, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.4280124306678772, + "step": 16920 + }, + { + "ce_loss": 0.06295457482337952, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "distill_loss": 0.1645834743976593, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "ref_ce_loss": 0.06780682504177094, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "loss": 0.31792157888412476, + "step": 16920 + }, + { + "ce_loss": 0.08119270205497742, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "distill_loss": 0.10686053335666656, + "epoch": 5.6437625083388925, + "step": 16920 + }, + { + "epoch": 5.6437625083388925, + "ref_ce_loss": 0.08020225912332535, + "step": 16920 + }, + { + "epoch": 5.647098065376918, + "loss": 0.4228, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "grad_norm": 2.3339974880218506, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "learning_rate": 6.306247896653833e-05, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "loss": 0.3029731512069702, + "step": 16930 + }, + { + "ce_loss": 0.024777529761195183, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "distill_loss": 0.16923952102661133, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "ref_ce_loss": 0.04961796849966049, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "loss": 0.29297712445259094, + "step": 16930 + }, + { + "ce_loss": 0.08377854526042938, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "distill_loss": 0.12032924592494965, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "ref_ce_loss": 0.06323269009590149, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "loss": 0.40155941247940063, + "step": 16930 + }, + { + "ce_loss": 0.08552870154380798, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "distill_loss": 0.1203310415148735, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "ref_ce_loss": 0.1074351966381073, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "loss": 0.3517897129058838, + "step": 16930 + }, + { + "ce_loss": 0.05377979949116707, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "distill_loss": 0.1443416327238083, + "epoch": 5.647098065376918, + "step": 16930 + }, + { + "epoch": 5.647098065376918, + "ref_ce_loss": 0.08523765951395035, + "step": 16930 + }, + { + "epoch": 5.650433622414943, + "loss": 0.3688, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "grad_norm": 2.552321434020996, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "learning_rate": 6.289748837333383e-05, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "loss": 0.43256378173828125, + "step": 16940 + }, + { + "ce_loss": 0.07758132368326187, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "distill_loss": 0.11776245385408401, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "ref_ce_loss": 0.08114629238843918, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "loss": 0.4100405275821686, + "step": 16940 + }, + { + "ce_loss": 0.11772602796554565, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "distill_loss": 0.1447276473045349, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "ref_ce_loss": 0.11454854905605316, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "loss": 0.4397696256637573, + "step": 16940 + }, + { + "ce_loss": 0.1683981567621231, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "distill_loss": 0.157260924577713, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "ref_ce_loss": 0.08736620843410492, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "loss": 0.38817211985588074, + "step": 16940 + }, + { + "ce_loss": 0.0835593119263649, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "distill_loss": 0.13545547425746918, + "epoch": 5.650433622414943, + "step": 16940 + }, + { + "epoch": 5.650433622414943, + "ref_ce_loss": 0.08349437266588211, + "step": 16940 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.385, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "grad_norm": 3.4251179695129395, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "learning_rate": 6.273265662075142e-05, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.23143433034420013, + "step": 16950 + }, + { + "ce_loss": 0.05177433043718338, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "distill_loss": 0.08931882679462433, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "ref_ce_loss": 0.06682384759187698, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.4629412293434143, + "step": 16950 + }, + { + "ce_loss": 0.11971709877252579, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "distill_loss": 0.1258133500814438, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "ref_ce_loss": 0.07613131403923035, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.2683376669883728, + "step": 16950 + }, + { + "ce_loss": 0.05023961886763573, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "distill_loss": 0.13034687936306, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "ref_ce_loss": 0.05619325861334801, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "loss": 0.2849885821342468, + "step": 16950 + }, + { + "ce_loss": 0.07378362864255905, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "distill_loss": 0.1290549486875534, + "epoch": 5.6537691794529685, + "step": 16950 + }, + { + "epoch": 5.6537691794529685, + "ref_ce_loss": 0.081927590072155, + "step": 16950 + }, + { + "epoch": 5.657104736490994, + "loss": 0.3898, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "grad_norm": 5.112725734710693, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "learning_rate": 6.256798400937919e-05, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "loss": 0.306372731924057, + "step": 16960 + }, + { + "ce_loss": 0.05546395480632782, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "distill_loss": 0.11730388551950455, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "ref_ce_loss": 0.08945717662572861, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "loss": 0.35184144973754883, + "step": 16960 + }, + { + "ce_loss": 0.12297777831554413, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "distill_loss": 0.13996382057666779, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "ref_ce_loss": 0.07640225440263748, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "loss": 0.4762941002845764, + "step": 16960 + }, + { + "ce_loss": 0.1554713249206543, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "distill_loss": 0.1667623519897461, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "ref_ce_loss": 0.07829466462135315, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "loss": 0.557092547416687, + "step": 16960 + }, + { + "ce_loss": 0.07773231714963913, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "distill_loss": 0.09895443171262741, + "epoch": 5.657104736490994, + "step": 16960 + }, + { + "epoch": 5.657104736490994, + "ref_ce_loss": 0.05536096170544624, + "step": 16960 + }, + { + "epoch": 5.660440293529019, + "loss": 0.4094, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "grad_norm": 3.074315071105957, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "learning_rate": 6.240347083951498e-05, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "loss": 0.3062307834625244, + "step": 16970 + }, + { + "ce_loss": 0.04686376824975014, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "distill_loss": 0.12231691926717758, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "ref_ce_loss": 0.06324928253889084, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "loss": 0.38145822286605835, + "step": 16970 + }, + { + "ce_loss": 0.07497313618659973, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "distill_loss": 0.10484956949949265, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "ref_ce_loss": 0.0924435704946518, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "loss": 0.25619828701019287, + "step": 16970 + }, + { + "ce_loss": 0.03664708137512207, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "distill_loss": 0.10653015226125717, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "ref_ce_loss": 0.06614420562982559, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "loss": 0.21806739270687103, + "step": 16970 + }, + { + "ce_loss": 0.04098616912961006, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "distill_loss": 0.10077305883169174, + "epoch": 5.660440293529019, + "step": 16970 + }, + { + "epoch": 5.660440293529019, + "ref_ce_loss": 0.05704665184020996, + "step": 16970 + }, + { + "epoch": 5.663775850567045, + "loss": 0.3637, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "grad_norm": 1.8063230514526367, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "learning_rate": 6.223911741116595e-05, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "loss": 0.6903234720230103, + "step": 16980 + }, + { + "ce_loss": 0.11006470024585724, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "distill_loss": 0.14529787003993988, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "ref_ce_loss": 0.10468210279941559, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "loss": 0.349357932806015, + "step": 16980 + }, + { + "ce_loss": 0.056875284761190414, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "distill_loss": 0.17770813405513763, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "ref_ce_loss": 0.0949898213148117, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "loss": 0.42283138632774353, + "step": 16980 + }, + { + "ce_loss": 0.12381055951118469, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "distill_loss": 0.18214209377765656, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "ref_ce_loss": 0.1167239099740982, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "loss": 0.23594720661640167, + "step": 16980 + }, + { + "ce_loss": 0.04936056211590767, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "distill_loss": 0.12174536287784576, + "epoch": 5.663775850567045, + "step": 16980 + }, + { + "epoch": 5.663775850567045, + "ref_ce_loss": 0.0527084581553936, + "step": 16980 + }, + { + "epoch": 5.66711140760507, + "loss": 0.3661, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "grad_norm": 3.128441572189331, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "learning_rate": 6.207492402404793e-05, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "loss": 0.24315060675144196, + "step": 16990 + }, + { + "ce_loss": 0.026851223781704903, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "distill_loss": 0.09447737038135529, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "ref_ce_loss": 0.0893881618976593, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "loss": 0.3972874581813812, + "step": 16990 + }, + { + "ce_loss": 0.09030399471521378, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "distill_loss": 0.16162776947021484, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "ref_ce_loss": 0.08049967885017395, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "loss": 0.33712124824523926, + "step": 16990 + }, + { + "ce_loss": 0.1013912633061409, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "distill_loss": 0.13497503101825714, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "ref_ce_loss": 0.08126402646303177, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "loss": 0.28021666407585144, + "step": 16990 + }, + { + "ce_loss": 0.1139599084854126, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "distill_loss": 0.10009388625621796, + "epoch": 5.66711140760507, + "step": 16990 + }, + { + "epoch": 5.66711140760507, + "ref_ce_loss": 0.06535064429044724, + "step": 16990 + }, + { + "epoch": 5.670446964643095, + "loss": 0.3448, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "grad_norm": 2.8193626403808594, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "learning_rate": 6.191089097758485e-05, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "loss": 0.3630388379096985, + "step": 17000 + }, + { + "ce_loss": 0.04292258620262146, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "distill_loss": 0.17938470840454102, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "ref_ce_loss": 0.06609540432691574, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "loss": 0.26592352986335754, + "step": 17000 + }, + { + "ce_loss": 0.05733026936650276, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "distill_loss": 0.1041751280426979, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "ref_ce_loss": 0.08246053755283356, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "loss": 0.30712056159973145, + "step": 17000 + }, + { + "ce_loss": 0.08909639716148376, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "distill_loss": 0.15928807854652405, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "ref_ce_loss": 0.058509256690740585, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "loss": 0.3685610294342041, + "step": 17000 + }, + { + "ce_loss": 0.0719846785068512, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "distill_loss": 0.19032107293605804, + "epoch": 5.670446964643095, + "step": 17000 + }, + { + "epoch": 5.670446964643095, + "ref_ce_loss": 0.06409589946269989, + "step": 17000 + }, + { + "epoch": 5.673782521681121, + "loss": 0.3253, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "grad_norm": 3.6958439350128174, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "learning_rate": 6.174701857090838e-05, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "loss": 0.3091316521167755, + "step": 17010 + }, + { + "ce_loss": 0.05044399946928024, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "distill_loss": 0.11642865091562271, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "ref_ce_loss": 0.10243427008390427, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "loss": 0.3319244384765625, + "step": 17010 + }, + { + "ce_loss": 0.05720686912536621, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "distill_loss": 0.1569894701242447, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "ref_ce_loss": 0.08333617448806763, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "loss": 0.3339140713214874, + "step": 17010 + }, + { + "ce_loss": 0.05138056352734566, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "distill_loss": 0.1706089973449707, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "ref_ce_loss": 0.06299076974391937, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "loss": 0.7556455135345459, + "step": 17010 + }, + { + "ce_loss": 0.10137326270341873, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "distill_loss": 0.14703679084777832, + "epoch": 5.673782521681121, + "step": 17010 + }, + { + "epoch": 5.673782521681121, + "ref_ce_loss": 0.08514633029699326, + "step": 17010 + }, + { + "epoch": 5.677118078719146, + "loss": 0.3976, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "grad_norm": 5.015145778656006, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "learning_rate": 6.158330710285702e-05, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "loss": 0.2708377242088318, + "step": 17020 + }, + { + "ce_loss": 0.03134353458881378, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "distill_loss": 0.09554773569107056, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "ref_ce_loss": 0.07234134525060654, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "loss": 0.22705619037151337, + "step": 17020 + }, + { + "ce_loss": 0.024353666231036186, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "distill_loss": 0.1192890852689743, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "ref_ce_loss": 0.08326216042041779, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "loss": 0.302390992641449, + "step": 17020 + }, + { + "ce_loss": 0.06740739196538925, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "distill_loss": 0.10257144272327423, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "ref_ce_loss": 0.10527370125055313, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "loss": 0.4007715880870819, + "step": 17020 + }, + { + "ce_loss": 0.10588482767343521, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "distill_loss": 0.22049292922019958, + "epoch": 5.677118078719146, + "step": 17020 + }, + { + "epoch": 5.677118078719146, + "ref_ce_loss": 0.07412932068109512, + "step": 17020 + }, + { + "epoch": 5.680453635757171, + "loss": 0.3595, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "grad_norm": 2.110219717025757, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "learning_rate": 6.141975687197596e-05, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "loss": 0.9901852607727051, + "step": 17030 + }, + { + "ce_loss": 0.06740203499794006, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "distill_loss": 0.14835187792778015, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "ref_ce_loss": 0.048046503216028214, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "loss": 1.149698257446289, + "step": 17030 + }, + { + "ce_loss": 0.09891572594642639, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "distill_loss": 0.12695176899433136, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "ref_ce_loss": 0.07718627154827118, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "loss": 0.485831618309021, + "step": 17030 + }, + { + "ce_loss": 0.1584722250699997, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "distill_loss": 0.1864461898803711, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "ref_ce_loss": 0.11007975041866302, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "loss": 0.3302626311779022, + "step": 17030 + }, + { + "ce_loss": 0.10280175507068634, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "distill_loss": 0.124929279088974, + "epoch": 5.680453635757171, + "step": 17030 + }, + { + "epoch": 5.680453635757171, + "ref_ce_loss": 0.07476763427257538, + "step": 17030 + }, + { + "epoch": 5.683789192795197, + "loss": 0.3904, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "grad_norm": 2.1810994148254395, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "learning_rate": 6.125636817651632e-05, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "loss": 0.22979770600795746, + "step": 17040 + }, + { + "ce_loss": 0.04653044044971466, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "distill_loss": 0.13956502079963684, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "ref_ce_loss": 0.04300351068377495, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "loss": 0.2592867314815521, + "step": 17040 + }, + { + "ce_loss": 0.06638824939727783, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "distill_loss": 0.12841610610485077, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "ref_ce_loss": 0.06432758271694183, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "loss": 0.5075886249542236, + "step": 17040 + }, + { + "ce_loss": 0.11375562846660614, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "distill_loss": 0.16561579704284668, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "ref_ce_loss": 0.08213319629430771, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "loss": 0.5379311442375183, + "step": 17040 + }, + { + "ce_loss": 0.1698356419801712, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "distill_loss": 0.1929055154323578, + "epoch": 5.683789192795197, + "step": 17040 + }, + { + "epoch": 5.683789192795197, + "ref_ce_loss": 0.09995917230844498, + "step": 17040 + }, + { + "epoch": 5.687124749833222, + "loss": 0.3788, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "grad_norm": 3.6914074420928955, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "learning_rate": 6.109314131443462e-05, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "loss": 0.30641597509384155, + "step": 17050 + }, + { + "ce_loss": 0.06863492727279663, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "distill_loss": 0.12574729323387146, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "ref_ce_loss": 0.04029463976621628, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "loss": 0.2634451389312744, + "step": 17050 + }, + { + "ce_loss": 0.0588807575404644, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "distill_loss": 0.11593613773584366, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "ref_ce_loss": 0.08846764266490936, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "loss": 0.3977257311344147, + "step": 17050 + }, + { + "ce_loss": 0.14133258163928986, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "distill_loss": 0.14880916476249695, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "ref_ce_loss": 0.09101521223783493, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "loss": 0.49362966418266296, + "step": 17050 + }, + { + "ce_loss": 0.05905555561184883, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "distill_loss": 0.14283868670463562, + "epoch": 5.687124749833222, + "step": 17050 + }, + { + "epoch": 5.687124749833222, + "ref_ce_loss": 0.06556601077318192, + "step": 17050 + }, + { + "epoch": 5.690460306871247, + "loss": 0.3621, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "grad_norm": 2.039789915084839, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "learning_rate": 6.0930076583392305e-05, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "loss": 0.3623387813568115, + "step": 17060 + }, + { + "ce_loss": 0.06750385463237762, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "distill_loss": 0.10620886087417603, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "ref_ce_loss": 0.06084444001317024, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "loss": 0.2275630533695221, + "step": 17060 + }, + { + "ce_loss": 0.04591386765241623, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "distill_loss": 0.10855705291032791, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "ref_ce_loss": 0.0493580587208271, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "loss": 0.2691376805305481, + "step": 17060 + }, + { + "ce_loss": 0.03460320085287094, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "distill_loss": 0.10596905648708344, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "ref_ce_loss": 0.06285975873470306, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "loss": 0.32650092244148254, + "step": 17060 + }, + { + "ce_loss": 0.06436863541603088, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "distill_loss": 0.1277579665184021, + "epoch": 5.690460306871247, + "step": 17060 + }, + { + "epoch": 5.690460306871247, + "ref_ce_loss": 0.09064862132072449, + "step": 17060 + }, + { + "epoch": 5.693795863909273, + "loss": 0.3159, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "grad_norm": 2.3171262741088867, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "learning_rate": 6.076717428075505e-05, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "loss": 0.2595212161540985, + "step": 17070 + }, + { + "ce_loss": 0.06113705039024353, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "distill_loss": 0.11590202152729034, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "ref_ce_loss": 0.0593080148100853, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "loss": 0.5655686855316162, + "step": 17070 + }, + { + "ce_loss": 0.05499016121029854, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "distill_loss": 0.17960451543331146, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "ref_ce_loss": 0.06927433609962463, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "loss": 0.4495111107826233, + "step": 17070 + }, + { + "ce_loss": 0.12065106630325317, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "distill_loss": 0.1322806179523468, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "ref_ce_loss": 0.12023723125457764, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "loss": 0.31495094299316406, + "step": 17070 + }, + { + "ce_loss": 0.07359825074672699, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "distill_loss": 0.11552683264017105, + "epoch": 5.693795863909273, + "step": 17070 + }, + { + "epoch": 5.693795863909273, + "ref_ce_loss": 0.09320148080587387, + "step": 17070 + }, + { + "epoch": 5.697131420947298, + "loss": 0.3576, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "grad_norm": 2.8441379070281982, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "learning_rate": 6.060443470359243e-05, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "loss": 0.18535450100898743, + "step": 17080 + }, + { + "ce_loss": 0.014140546321868896, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "distill_loss": 0.10060036182403564, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "ref_ce_loss": 0.05045443773269653, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "loss": 0.23556338250637054, + "step": 17080 + }, + { + "ce_loss": 0.0404030904173851, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "distill_loss": 0.11873391270637512, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "ref_ce_loss": 0.052613597363233566, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "loss": 0.3786413371562958, + "step": 17080 + }, + { + "ce_loss": 0.05384358391165733, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "distill_loss": 0.08683878928422928, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "ref_ce_loss": 0.07605697959661484, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "loss": 0.3993130326271057, + "step": 17080 + }, + { + "ce_loss": 0.05315512791275978, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "distill_loss": 0.1281694769859314, + "epoch": 5.697131420947298, + "step": 17080 + }, + { + "epoch": 5.697131420947298, + "ref_ce_loss": 0.07677015662193298, + "step": 17080 + }, + { + "epoch": 5.700466977985323, + "loss": 0.3881, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "grad_norm": 6.708038806915283, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "learning_rate": 6.0441858148677274e-05, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "loss": 0.4696926176548004, + "step": 17090 + }, + { + "ce_loss": 0.1001918688416481, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "distill_loss": 0.21125702559947968, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "ref_ce_loss": 0.05627693980932236, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "loss": 0.7283337116241455, + "step": 17090 + }, + { + "ce_loss": 0.1716901659965515, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "distill_loss": 0.171518474817276, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "ref_ce_loss": 0.05830475315451622, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "loss": 0.2751966714859009, + "step": 17090 + }, + { + "ce_loss": 0.039496466517448425, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "distill_loss": 0.14405354857444763, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "ref_ce_loss": 0.059192027896642685, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "loss": 0.28596171736717224, + "step": 17090 + }, + { + "ce_loss": 0.0590527206659317, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "distill_loss": 0.10830798745155334, + "epoch": 5.700466977985323, + "step": 17090 + }, + { + "epoch": 5.700466977985323, + "ref_ce_loss": 0.06655246019363403, + "step": 17090 + }, + { + "epoch": 5.703802535023349, + "loss": 0.3464, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "grad_norm": 3.195554256439209, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "learning_rate": 6.027944491248502e-05, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "loss": 0.4235726594924927, + "step": 17100 + }, + { + "ce_loss": 0.07542421668767929, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "distill_loss": 0.11276789754629135, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "ref_ce_loss": 0.05181581526994705, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "loss": 0.26524561643600464, + "step": 17100 + }, + { + "ce_loss": 0.021398169919848442, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "distill_loss": 0.12772291898727417, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "ref_ce_loss": 0.0794157013297081, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "loss": 0.5993280410766602, + "step": 17100 + }, + { + "ce_loss": 0.1594715267419815, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "distill_loss": 0.15677529573440552, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "ref_ce_loss": 0.11099915206432343, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "loss": 0.2803640365600586, + "step": 17100 + }, + { + "ce_loss": 0.039205145090818405, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "distill_loss": 0.11149227619171143, + "epoch": 5.703802535023349, + "step": 17100 + }, + { + "epoch": 5.703802535023349, + "ref_ce_loss": 0.09155328571796417, + "step": 17100 + }, + { + "epoch": 5.707138092061374, + "loss": 0.3497, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "grad_norm": 1.5642236471176147, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "learning_rate": 6.011719529119337e-05, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "loss": 0.3479704260826111, + "step": 17110 + }, + { + "ce_loss": 0.06622432172298431, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "distill_loss": 0.11067965626716614, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "ref_ce_loss": 0.09841424971818924, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "loss": 0.30340105295181274, + "step": 17110 + }, + { + "ce_loss": 0.0672907829284668, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "distill_loss": 0.12754839658737183, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "ref_ce_loss": 0.0743776187300682, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "loss": 0.4672259986400604, + "step": 17110 + }, + { + "ce_loss": 0.11600927263498306, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "distill_loss": 0.16114374995231628, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "ref_ce_loss": 0.08801127225160599, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "loss": 0.4922545254230499, + "step": 17110 + }, + { + "ce_loss": 0.1250251680612564, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "distill_loss": 0.16420485079288483, + "epoch": 5.707138092061374, + "step": 17110 + }, + { + "epoch": 5.707138092061374, + "ref_ce_loss": 0.09376829862594604, + "step": 17110 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.37, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "grad_norm": 2.751373052597046, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "learning_rate": 5.995510958068162e-05, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.3050478398799896, + "step": 17120 + }, + { + "ce_loss": 0.07524151355028152, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "distill_loss": 0.14798586070537567, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "ref_ce_loss": 0.06977467238903046, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.2592675983905792, + "step": 17120 + }, + { + "ce_loss": 0.043805621564388275, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "distill_loss": 0.11673689633607864, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "ref_ce_loss": 0.07951228320598602, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.27545806765556335, + "step": 17120 + }, + { + "ce_loss": 0.02748233452439308, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "distill_loss": 0.09332962334156036, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "ref_ce_loss": 0.05149763822555542, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "loss": 0.22703437507152557, + "step": 17120 + }, + { + "ce_loss": 0.02626241184771061, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "distill_loss": 0.1262034773826599, + "epoch": 5.7104736490993995, + "step": 17120 + }, + { + "epoch": 5.7104736490993995, + "ref_ce_loss": 0.07425781339406967, + "step": 17120 + }, + { + "epoch": 5.713809206137425, + "loss": 0.311, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "grad_norm": 3.511167526245117, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "learning_rate": 5.979318807653019e-05, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "loss": 0.1948380023241043, + "step": 17130 + }, + { + "ce_loss": 0.04529014602303505, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "distill_loss": 0.1010599136352539, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "ref_ce_loss": 0.0482073649764061, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "loss": 0.23362573981285095, + "step": 17130 + }, + { + "ce_loss": 0.043237727135419846, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "distill_loss": 0.1262238621711731, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "ref_ce_loss": 0.0638972595334053, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "loss": 0.2678848206996918, + "step": 17130 + }, + { + "ce_loss": 0.03875793516635895, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "distill_loss": 0.1362183392047882, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "ref_ce_loss": 0.09272968024015427, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "loss": 0.28923311829566956, + "step": 17130 + }, + { + "ce_loss": 0.07208915799856186, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "distill_loss": 0.13289576768875122, + "epoch": 5.713809206137425, + "step": 17130 + }, + { + "epoch": 5.713809206137425, + "ref_ce_loss": 0.06253129988908768, + "step": 17130 + }, + { + "epoch": 5.71714476317545, + "loss": 0.33, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "grad_norm": 2.24771785736084, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "learning_rate": 5.963143107402007e-05, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "loss": 0.2893480956554413, + "step": 17140 + }, + { + "ce_loss": 0.06652390211820602, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "distill_loss": 0.14231842756271362, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "ref_ce_loss": 0.08033139258623123, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "loss": 0.40271690487861633, + "step": 17140 + }, + { + "ce_loss": 0.040531329810619354, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "distill_loss": 0.1344706416130066, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "ref_ce_loss": 0.10532485693693161, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "loss": 0.2142089456319809, + "step": 17140 + }, + { + "ce_loss": 0.03037523478269577, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "distill_loss": 0.09983956813812256, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "ref_ce_loss": 0.05833209678530693, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "loss": 0.4400641918182373, + "step": 17140 + }, + { + "ce_loss": 0.06064482778310776, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "distill_loss": 0.12813520431518555, + "epoch": 5.71714476317545, + "step": 17140 + }, + { + "epoch": 5.71714476317545, + "ref_ce_loss": 0.044088393449783325, + "step": 17140 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.3816, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "grad_norm": 2.1068670749664307, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "learning_rate": 5.946983886813216e-05, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.3585885763168335, + "step": 17150 + }, + { + "ce_loss": 0.0682455524802208, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "distill_loss": 0.17169511318206787, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "ref_ce_loss": 0.08379658311605453, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.342392235994339, + "step": 17150 + }, + { + "ce_loss": 0.03174396976828575, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "distill_loss": 0.14371605217456818, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "ref_ce_loss": 0.09976787120103836, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.31579867005348206, + "step": 17150 + }, + { + "ce_loss": 0.06596563011407852, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "distill_loss": 0.12314214557409286, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "ref_ce_loss": 0.06553999334573746, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "loss": 0.615726113319397, + "step": 17150 + }, + { + "ce_loss": 0.1368461549282074, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "distill_loss": 0.14078648388385773, + "epoch": 5.7204803202134755, + "step": 17150 + }, + { + "epoch": 5.7204803202134755, + "ref_ce_loss": 0.07783728837966919, + "step": 17150 + }, + { + "epoch": 5.723815877251501, + "loss": 0.349, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "grad_norm": 1.7467983961105347, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "learning_rate": 5.930841175354689e-05, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "loss": 0.2381708174943924, + "step": 17160 + }, + { + "ce_loss": 0.041369061917066574, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "distill_loss": 0.1097210943698883, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "ref_ce_loss": 0.046333197504282, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "loss": 0.2818985879421234, + "step": 17160 + }, + { + "ce_loss": 0.05317312851548195, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "distill_loss": 0.13140282034873962, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "ref_ce_loss": 0.07650244235992432, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "loss": 0.2387668788433075, + "step": 17160 + }, + { + "ce_loss": 0.036102037876844406, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "distill_loss": 0.10024573653936386, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "ref_ce_loss": 0.06697791069746017, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "loss": 0.35411015152931213, + "step": 17160 + }, + { + "ce_loss": 0.12947840988636017, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "distill_loss": 0.11806891113519669, + "epoch": 5.723815877251501, + "step": 17160 + }, + { + "epoch": 5.723815877251501, + "ref_ce_loss": 0.09559057652950287, + "step": 17160 + }, + { + "epoch": 5.727151434289526, + "loss": 0.3435, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "grad_norm": 2.8510990142822266, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "learning_rate": 5.914715002464368e-05, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "loss": 0.376874178647995, + "step": 17170 + }, + { + "ce_loss": 0.10144776105880737, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "distill_loss": 0.10674238950014114, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "ref_ce_loss": 0.08582198619842529, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "loss": 0.4589444100856781, + "step": 17170 + }, + { + "ce_loss": 0.07079615443944931, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "distill_loss": 0.14036592841148376, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "ref_ce_loss": 0.09123189002275467, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "loss": 0.30794355273246765, + "step": 17170 + }, + { + "ce_loss": 0.05897359549999237, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "distill_loss": 0.10716037452220917, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "ref_ce_loss": 0.07225260138511658, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "loss": 0.24795758724212646, + "step": 17170 + }, + { + "ce_loss": 0.06725875288248062, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "distill_loss": 0.1040724664926529, + "epoch": 5.727151434289526, + "step": 17170 + }, + { + "epoch": 5.727151434289526, + "ref_ce_loss": 0.07633961737155914, + "step": 17170 + }, + { + "epoch": 5.730486991327552, + "loss": 0.3443, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "grad_norm": 3.2798383235931396, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "learning_rate": 5.8986053975500306e-05, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "loss": 0.30777508020401, + "step": 17180 + }, + { + "ce_loss": 0.06996277719736099, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "distill_loss": 0.13840298354625702, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "ref_ce_loss": 0.056659650057554245, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "loss": 0.29943349957466125, + "step": 17180 + }, + { + "ce_loss": 0.019586697220802307, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "distill_loss": 0.09618957340717316, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "ref_ce_loss": 0.05273442342877388, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "loss": 0.3717377781867981, + "step": 17180 + }, + { + "ce_loss": 0.14365200698375702, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "distill_loss": 0.15098531544208527, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "ref_ce_loss": 0.0676068514585495, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "loss": 0.3774576485157013, + "step": 17180 + }, + { + "ce_loss": 0.12183333188295364, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "distill_loss": 0.15439927577972412, + "epoch": 5.730486991327552, + "step": 17180 + }, + { + "epoch": 5.730486991327552, + "ref_ce_loss": 0.07511351257562637, + "step": 17180 + }, + { + "epoch": 5.733822548365577, + "loss": 0.3472, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "grad_norm": 3.112159490585327, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "learning_rate": 5.882512389989244e-05, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "loss": 0.4421817660331726, + "step": 17190 + }, + { + "ce_loss": 0.08099653571844101, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "distill_loss": 0.20904618501663208, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "ref_ce_loss": 0.05432404205203056, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "loss": 0.5608204007148743, + "step": 17190 + }, + { + "ce_loss": 0.09235455095767975, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "distill_loss": 0.20927207171916962, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "ref_ce_loss": 0.10277006030082703, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "loss": 0.2641429305076599, + "step": 17190 + }, + { + "ce_loss": 0.08527445793151855, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "distill_loss": 0.12051382660865784, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "ref_ce_loss": 0.044529445469379425, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "loss": 0.7164769172668457, + "step": 17190 + }, + { + "ce_loss": 0.11412998288869858, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "distill_loss": 0.16576185822486877, + "epoch": 5.733822548365577, + "step": 17190 + }, + { + "epoch": 5.733822548365577, + "ref_ce_loss": 0.1029839813709259, + "step": 17190 + }, + { + "epoch": 5.737158105403602, + "loss": 0.3407, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "grad_norm": 2.1736953258514404, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "learning_rate": 5.866436009129299e-05, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "loss": 0.40977779030799866, + "step": 17200 + }, + { + "ce_loss": 0.0977315753698349, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "distill_loss": 0.14270806312561035, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "ref_ce_loss": 0.04262208193540573, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "loss": 0.176670104265213, + "step": 17200 + }, + { + "ce_loss": 0.04305120185017586, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "distill_loss": 0.09481915831565857, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "ref_ce_loss": 0.038723744451999664, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "loss": 0.36046162247657776, + "step": 17200 + }, + { + "ce_loss": 0.05129627510905266, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "distill_loss": 0.1609860509634018, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "ref_ce_loss": 0.06556078791618347, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "loss": 0.329052209854126, + "step": 17200 + }, + { + "ce_loss": 0.023697247728705406, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "distill_loss": 0.20539245009422302, + "epoch": 5.737158105403602, + "step": 17200 + }, + { + "epoch": 5.737158105403602, + "ref_ce_loss": 0.06529682129621506, + "step": 17200 + }, + { + "epoch": 5.740493662441628, + "loss": 0.3736, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "grad_norm": 1.4739378690719604, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "learning_rate": 5.850376284287177e-05, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "loss": 0.3846653401851654, + "step": 17210 + }, + { + "ce_loss": 0.009561138227581978, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "distill_loss": 0.11265572905540466, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "ref_ce_loss": 0.07206114381551743, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "loss": 0.27818945050239563, + "step": 17210 + }, + { + "ce_loss": 0.04575086012482643, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "distill_loss": 0.12364290654659271, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "ref_ce_loss": 0.046676069498062134, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "loss": 0.31520622968673706, + "step": 17210 + }, + { + "ce_loss": 0.03989662602543831, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "distill_loss": 0.184968501329422, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "ref_ce_loss": 0.06694689393043518, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "loss": 0.31031399965286255, + "step": 17210 + }, + { + "ce_loss": 0.07887408137321472, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "distill_loss": 0.16484485566616058, + "epoch": 5.740493662441628, + "step": 17210 + }, + { + "epoch": 5.740493662441628, + "ref_ce_loss": 0.06637794524431229, + "step": 17210 + }, + { + "epoch": 5.743829219479653, + "loss": 0.3617, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "grad_norm": 2.416130781173706, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "learning_rate": 5.8343332447494786e-05, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "loss": 0.3554649353027344, + "step": 17220 + }, + { + "ce_loss": 0.08824531733989716, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "distill_loss": 0.0907503142952919, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "ref_ce_loss": 0.07792174071073532, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "loss": 0.36913350224494934, + "step": 17220 + }, + { + "ce_loss": 0.11172915250062943, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "distill_loss": 0.14446821808815002, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "ref_ce_loss": 0.08583048731088638, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "loss": 0.290985643863678, + "step": 17220 + }, + { + "ce_loss": 0.054969705641269684, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "distill_loss": 0.1072135642170906, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "ref_ce_loss": 0.07966025918722153, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "loss": 0.3522094488143921, + "step": 17220 + }, + { + "ce_loss": 0.02910817228257656, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "distill_loss": 0.1539473682641983, + "epoch": 5.743829219479653, + "step": 17220 + }, + { + "epoch": 5.743829219479653, + "ref_ce_loss": 0.06737984716892242, + "step": 17220 + }, + { + "epoch": 5.747164776517678, + "loss": 0.3585, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "grad_norm": 2.5325045585632324, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "learning_rate": 5.818306919772382e-05, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "loss": 0.3878948986530304, + "step": 17230 + }, + { + "ce_loss": 0.13507422804832458, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "distill_loss": 0.13404646515846252, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "ref_ce_loss": 0.07569249719381332, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "loss": 0.4121205806732178, + "step": 17230 + }, + { + "ce_loss": 0.14450865983963013, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "distill_loss": 0.16785818338394165, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "ref_ce_loss": 0.07864559441804886, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "loss": 0.25712525844573975, + "step": 17230 + }, + { + "ce_loss": 0.027721816673874855, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "distill_loss": 0.14876680076122284, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "ref_ce_loss": 0.05708444118499756, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "loss": 0.35496917366981506, + "step": 17230 + }, + { + "ce_loss": 0.09509100019931793, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "distill_loss": 0.13708890974521637, + "epoch": 5.747164776517678, + "step": 17230 + }, + { + "epoch": 5.747164776517678, + "ref_ce_loss": 0.08199819922447205, + "step": 17230 + }, + { + "epoch": 5.750500333555704, + "loss": 0.3272, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "grad_norm": 2.6290535926818848, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "learning_rate": 5.802297338581588e-05, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "loss": 0.29643863439559937, + "step": 17240 + }, + { + "ce_loss": 0.04670005664229393, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "distill_loss": 0.16171149909496307, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "ref_ce_loss": 0.08766219764947891, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "loss": 0.3715093433856964, + "step": 17240 + }, + { + "ce_loss": 0.05629168450832367, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "distill_loss": 0.12367063015699387, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "ref_ce_loss": 0.08564885705709457, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "loss": 0.3038267493247986, + "step": 17240 + }, + { + "ce_loss": 0.06020534783601761, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "distill_loss": 0.10410208255052567, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "ref_ce_loss": 0.05112629383802414, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "loss": 0.4775630235671997, + "step": 17240 + }, + { + "ce_loss": 0.0565582811832428, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "distill_loss": 0.23624451458454132, + "epoch": 5.750500333555704, + "step": 17240 + }, + { + "epoch": 5.750500333555704, + "ref_ce_loss": 0.06023179367184639, + "step": 17240 + }, + { + "epoch": 5.753835890593729, + "loss": 0.3725, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "grad_norm": 3.8670239448547363, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "learning_rate": 5.786304530372244e-05, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "loss": 0.2475840002298355, + "step": 17250 + }, + { + "ce_loss": 0.04578208178281784, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "distill_loss": 0.12310880422592163, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "ref_ce_loss": 0.07860726118087769, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "loss": 0.3713269531726837, + "step": 17250 + }, + { + "ce_loss": 0.08053571730852127, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "distill_loss": 0.12296618521213531, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "ref_ce_loss": 0.08550073206424713, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "loss": 0.26044490933418274, + "step": 17250 + }, + { + "ce_loss": 0.012742365710437298, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "distill_loss": 0.09784765541553497, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "ref_ce_loss": 0.07161511480808258, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "loss": 0.4403138756752014, + "step": 17250 + }, + { + "ce_loss": 0.07575368136167526, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "distill_loss": 0.13287517428398132, + "epoch": 5.753835890593729, + "step": 17250 + }, + { + "epoch": 5.753835890593729, + "ref_ce_loss": 0.062234990298748016, + "step": 17250 + }, + { + "epoch": 5.757171447631754, + "loss": 0.3399, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "grad_norm": 2.8273019790649414, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "learning_rate": 5.770328524308932e-05, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "loss": 0.3089262545108795, + "step": 17260 + }, + { + "ce_loss": 0.03648150712251663, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "distill_loss": 0.1352384388446808, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "ref_ce_loss": 0.0866464227437973, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "loss": 0.43987128138542175, + "step": 17260 + }, + { + "ce_loss": 0.14717067778110504, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "distill_loss": 0.16674935817718506, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "ref_ce_loss": 0.06602030247449875, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "loss": 0.32174310088157654, + "step": 17260 + }, + { + "ce_loss": 0.11366469413042068, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "distill_loss": 0.1256704330444336, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "ref_ce_loss": 0.06479135155677795, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "loss": 0.2875996530056, + "step": 17260 + }, + { + "ce_loss": 0.05902746692299843, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "distill_loss": 0.11191102862358093, + "epoch": 5.757171447631754, + "step": 17260 + }, + { + "epoch": 5.757171447631754, + "ref_ce_loss": 0.08170730620622635, + "step": 17260 + }, + { + "epoch": 5.76050700466978, + "loss": 0.3647, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "grad_norm": 2.8032124042510986, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "learning_rate": 5.754369349525581e-05, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "loss": 0.48637938499450684, + "step": 17270 + }, + { + "ce_loss": 0.10797074437141418, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "distill_loss": 0.14722202718257904, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "ref_ce_loss": 0.08467122912406921, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "loss": 0.4154958724975586, + "step": 17270 + }, + { + "ce_loss": 0.05239958316087723, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "distill_loss": 0.09364564716815948, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "ref_ce_loss": 0.044636160135269165, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "loss": 0.2659384310245514, + "step": 17270 + }, + { + "ce_loss": 0.03654512017965317, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "distill_loss": 0.0760873556137085, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "ref_ce_loss": 0.06807711720466614, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "loss": 0.46144965291023254, + "step": 17270 + }, + { + "ce_loss": 0.03658605366945267, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "distill_loss": 0.1725316047668457, + "epoch": 5.76050700466978, + "step": 17270 + }, + { + "epoch": 5.76050700466978, + "ref_ce_loss": 0.07646312564611435, + "step": 17270 + }, + { + "epoch": 5.763842561707805, + "loss": 0.3639, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "grad_norm": 3.59045147895813, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "learning_rate": 5.738427035125435e-05, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "loss": 0.22502562403678894, + "step": 17280 + }, + { + "ce_loss": 0.038079943507909775, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "distill_loss": 0.10309861600399017, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "ref_ce_loss": 0.05601663887500763, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "loss": 0.4142107665538788, + "step": 17280 + }, + { + "ce_loss": 0.11307626217603683, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "distill_loss": 0.12099690735340118, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "ref_ce_loss": 0.10019832104444504, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "loss": 0.3148443102836609, + "step": 17280 + }, + { + "ce_loss": 0.08613236248493195, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "distill_loss": 0.12056463956832886, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "ref_ce_loss": 0.06543400883674622, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "loss": 0.17869870364665985, + "step": 17280 + }, + { + "ce_loss": 0.029490550979971886, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "distill_loss": 0.08921138197183609, + "epoch": 5.763842561707805, + "step": 17280 + }, + { + "epoch": 5.763842561707805, + "ref_ce_loss": 0.03786267712712288, + "step": 17280 + }, + { + "epoch": 5.76717811874583, + "loss": 0.3259, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "grad_norm": 2.233980894088745, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "learning_rate": 5.722501610180984e-05, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "loss": 0.27729642391204834, + "step": 17290 + }, + { + "ce_loss": 0.05460566282272339, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "distill_loss": 0.12321322411298752, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "ref_ce_loss": 0.07130646705627441, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "loss": 0.516023576259613, + "step": 17290 + }, + { + "ce_loss": 0.05338827893137932, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "distill_loss": 0.11183921992778778, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "ref_ce_loss": 0.057988960295915604, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "loss": 0.48194584250450134, + "step": 17290 + }, + { + "ce_loss": 0.06910925358533859, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "distill_loss": 0.221344456076622, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "ref_ce_loss": 0.08294906467199326, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "loss": 0.2483086884021759, + "step": 17290 + }, + { + "ce_loss": 0.05446473881602287, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "distill_loss": 0.1190895289182663, + "epoch": 5.76717811874583, + "step": 17290 + }, + { + "epoch": 5.76717811874583, + "ref_ce_loss": 0.07463015615940094, + "step": 17290 + }, + { + "epoch": 5.770513675783856, + "loss": 0.3169, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "grad_norm": 1.6905715465545654, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "learning_rate": 5.706593103733926e-05, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "loss": 0.36666467785835266, + "step": 17300 + }, + { + "ce_loss": 0.1196926012635231, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "distill_loss": 0.12403534352779388, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "ref_ce_loss": 0.09720993787050247, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "loss": 0.23492708802223206, + "step": 17300 + }, + { + "ce_loss": 0.047469962388277054, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "distill_loss": 0.13368260860443115, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "ref_ce_loss": 0.05329669639468193, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "loss": 0.24395941197872162, + "step": 17300 + }, + { + "ce_loss": 0.03127888962626457, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "distill_loss": 0.1213611513376236, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "ref_ce_loss": 0.09058709442615509, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "loss": 0.39396345615386963, + "step": 17300 + }, + { + "ce_loss": 0.09582968801259995, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "distill_loss": 0.19662624597549438, + "epoch": 5.770513675783856, + "step": 17300 + }, + { + "epoch": 5.770513675783856, + "ref_ce_loss": 0.10104110091924667, + "step": 17300 + }, + { + "epoch": 5.773849232821881, + "loss": 0.3468, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "grad_norm": 2.5617034435272217, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "learning_rate": 5.690701544795092e-05, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "loss": 0.193987637758255, + "step": 17310 + }, + { + "ce_loss": 0.0359039269387722, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "distill_loss": 0.08433974534273148, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "ref_ce_loss": 0.05012137070298195, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "loss": 0.3732142746448517, + "step": 17310 + }, + { + "ce_loss": 0.07121843844652176, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "distill_loss": 0.1710732877254486, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "ref_ce_loss": 0.06881940364837646, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "loss": 0.21098724007606506, + "step": 17310 + }, + { + "ce_loss": 0.024630699306726456, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "distill_loss": 0.13975226879119873, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "ref_ce_loss": 0.046379558742046356, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "loss": 0.261945515871048, + "step": 17310 + }, + { + "ce_loss": 0.03864102438092232, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "distill_loss": 0.11703591793775558, + "epoch": 5.773849232821881, + "step": 17310 + }, + { + "epoch": 5.773849232821881, + "ref_ce_loss": 0.055460765957832336, + "step": 17310 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.3371, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "grad_norm": 4.389605522155762, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "learning_rate": 5.6748269623444264e-05, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.21445885300636292, + "step": 17320 + }, + { + "ce_loss": 0.037108927965164185, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "distill_loss": 0.0989280715584755, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "ref_ce_loss": 0.029425648972392082, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.3353267312049866, + "step": 17320 + }, + { + "ce_loss": 0.040188174694776535, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "distill_loss": 0.0841199979186058, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "ref_ce_loss": 0.07783445715904236, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.2943035066127777, + "step": 17320 + }, + { + "ce_loss": 0.06536614894866943, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "distill_loss": 0.12621468305587769, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "ref_ce_loss": 0.07469505071640015, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "loss": 0.2589566707611084, + "step": 17320 + }, + { + "ce_loss": 0.030605845153331757, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "distill_loss": 0.11211664229631424, + "epoch": 5.7771847898599065, + "step": 17320 + }, + { + "epoch": 5.7771847898599065, + "ref_ce_loss": 0.06319501250982285, + "step": 17320 + }, + { + "epoch": 5.780520346897932, + "loss": 0.3417, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "grad_norm": 2.9060959815979004, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "learning_rate": 5.658969385330891e-05, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "loss": 0.506881833076477, + "step": 17330 + }, + { + "ce_loss": 0.07976707071065903, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "distill_loss": 0.09284459054470062, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "ref_ce_loss": 0.07745490968227386, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "loss": 0.4200765788555145, + "step": 17330 + }, + { + "ce_loss": 0.07465557754039764, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "distill_loss": 0.14287123084068298, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "ref_ce_loss": 0.06909674406051636, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "loss": 0.2414490431547165, + "step": 17330 + }, + { + "ce_loss": 0.05098666995763779, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "distill_loss": 0.1095428466796875, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "ref_ce_loss": 0.05398302897810936, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "loss": 0.33691316843032837, + "step": 17330 + }, + { + "ce_loss": 0.11903326958417892, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "distill_loss": 0.11317283660173416, + "epoch": 5.780520346897932, + "step": 17330 + }, + { + "epoch": 5.780520346897932, + "ref_ce_loss": 0.08404218405485153, + "step": 17330 + }, + { + "epoch": 5.783855903935957, + "loss": 0.3367, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "grad_norm": 2.7188363075256348, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "learning_rate": 5.643128842672467e-05, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "loss": 0.47430357336997986, + "step": 17340 + }, + { + "ce_loss": 0.076082244515419, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "distill_loss": 0.1695503294467926, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "ref_ce_loss": 0.06374223530292511, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "loss": 0.36884844303131104, + "step": 17340 + }, + { + "ce_loss": 0.0587586909532547, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "distill_loss": 0.11374857276678085, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "ref_ce_loss": 0.08658315241336823, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "loss": 0.432818740606308, + "step": 17340 + }, + { + "ce_loss": 0.09153823554515839, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "distill_loss": 0.2312244325876236, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "ref_ce_loss": 0.08305595070123672, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "loss": 0.4313298761844635, + "step": 17340 + }, + { + "ce_loss": 0.060097139328718185, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "distill_loss": 0.13053864240646362, + "epoch": 5.783855903935957, + "step": 17340 + }, + { + "epoch": 5.783855903935957, + "ref_ce_loss": 0.08896686881780624, + "step": 17340 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.4102, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "grad_norm": 4.448433876037598, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "learning_rate": 5.627305363256054e-05, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.36065179109573364, + "step": 17350 + }, + { + "ce_loss": 0.07306523621082306, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "distill_loss": 0.17802460491657257, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "ref_ce_loss": 0.08533138036727905, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.3039696216583252, + "step": 17350 + }, + { + "ce_loss": 0.06814741343259811, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "distill_loss": 0.13009023666381836, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "ref_ce_loss": 0.07732559740543365, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.2832317054271698, + "step": 17350 + }, + { + "ce_loss": 0.0839838832616806, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "distill_loss": 0.09487424045801163, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "ref_ce_loss": 0.06302163749933243, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "loss": 0.34753546118736267, + "step": 17350 + }, + { + "ce_loss": 0.0784483551979065, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "distill_loss": 0.17030097544193268, + "epoch": 5.7871914609739825, + "step": 17350 + }, + { + "epoch": 5.7871914609739825, + "ref_ce_loss": 0.09857328981161118, + "step": 17350 + }, + { + "epoch": 5.790527018012008, + "loss": 0.3631, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "grad_norm": 3.4604597091674805, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "learning_rate": 5.6114989759374264e-05, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "loss": 0.3250521421432495, + "step": 17360 + }, + { + "ce_loss": 0.06812068819999695, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "distill_loss": 0.14425088465213776, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "ref_ce_loss": 0.05242958292365074, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "loss": 0.38723093271255493, + "step": 17360 + }, + { + "ce_loss": 0.04637950658798218, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "distill_loss": 0.12929755449295044, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "ref_ce_loss": 0.09616874158382416, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "loss": 0.4408058822154999, + "step": 17360 + }, + { + "ce_loss": 0.10426130890846252, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "distill_loss": 0.13043643534183502, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "ref_ce_loss": 0.10126718878746033, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "loss": 0.184561625123024, + "step": 17360 + }, + { + "ce_loss": 0.028995148837566376, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "distill_loss": 0.10370610654354095, + "epoch": 5.790527018012008, + "step": 17360 + }, + { + "epoch": 5.790527018012008, + "ref_ce_loss": 0.05134010314941406, + "step": 17360 + }, + { + "epoch": 5.793862575050033, + "loss": 0.3432, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "grad_norm": 3.010852336883545, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "learning_rate": 5.595709709541212e-05, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "loss": 0.4072456359863281, + "step": 17370 + }, + { + "ce_loss": 0.06752721965312958, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "distill_loss": 0.12647153437137604, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "ref_ce_loss": 0.07747430354356766, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "loss": 0.370725154876709, + "step": 17370 + }, + { + "ce_loss": 0.05985059216618538, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "distill_loss": 0.14739835262298584, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "ref_ce_loss": 0.059420146048069, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "loss": 0.28383949398994446, + "step": 17370 + }, + { + "ce_loss": 0.06301853060722351, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "distill_loss": 0.11598997563123703, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "ref_ce_loss": 0.09168814867734909, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "loss": 0.2755222022533417, + "step": 17370 + }, + { + "ce_loss": 0.04786836355924606, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "distill_loss": 0.15717007219791412, + "epoch": 5.793862575050033, + "step": 17370 + }, + { + "epoch": 5.793862575050033, + "ref_ce_loss": 0.053128596395254135, + "step": 17370 + }, + { + "epoch": 5.797198132088059, + "loss": 0.35, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "grad_norm": 1.6434465646743774, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "learning_rate": 5.5799375928607897e-05, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "loss": 0.7022756338119507, + "step": 17380 + }, + { + "ce_loss": 0.09961540251970291, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "distill_loss": 0.19559799134731293, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "ref_ce_loss": 0.12969614565372467, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "loss": 0.44806501269340515, + "step": 17380 + }, + { + "ce_loss": 0.12347184866666794, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "distill_loss": 0.1917111724615097, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "ref_ce_loss": 0.09800612181425095, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "loss": 0.27114999294281006, + "step": 17380 + }, + { + "ce_loss": 0.04157739877700806, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "distill_loss": 0.13314248621463776, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "ref_ce_loss": 0.06825023144483566, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "loss": 0.30957531929016113, + "step": 17380 + }, + { + "ce_loss": 0.05008748173713684, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "distill_loss": 0.1175016239285469, + "epoch": 5.797198132088059, + "step": 17380 + }, + { + "epoch": 5.797198132088059, + "ref_ce_loss": 0.06017220392823219, + "step": 17380 + }, + { + "epoch": 5.800533689126084, + "loss": 0.343, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "grad_norm": 1.723335862159729, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "learning_rate": 5.5641826546582844e-05, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "loss": 0.22227926552295685, + "step": 17390 + }, + { + "ce_loss": 0.054145898669958115, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "distill_loss": 0.12056168913841248, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "ref_ce_loss": 0.046798594295978546, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "loss": 0.27474644780158997, + "step": 17390 + }, + { + "ce_loss": 0.031725261360406876, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "distill_loss": 0.1447744369506836, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "ref_ce_loss": 0.06469041854143143, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "loss": 0.42471349239349365, + "step": 17390 + }, + { + "ce_loss": 0.06565036624670029, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "distill_loss": 0.15499937534332275, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "ref_ce_loss": 0.0806792750954628, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "loss": 0.43315747380256653, + "step": 17390 + }, + { + "ce_loss": 0.13766764104366302, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "distill_loss": 0.16289286315441132, + "epoch": 5.800533689126084, + "step": 17390 + }, + { + "epoch": 5.800533689126084, + "ref_ce_loss": 0.06286300718784332, + "step": 17390 + }, + { + "epoch": 5.803869246164109, + "loss": 0.352, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "grad_norm": 2.084108352661133, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "learning_rate": 5.548444923664499e-05, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "loss": 0.4230390191078186, + "step": 17400 + }, + { + "ce_loss": 0.12096190452575684, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "distill_loss": 0.1789836287498474, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "ref_ce_loss": 0.12275371700525284, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "loss": 0.24342182278633118, + "step": 17400 + }, + { + "ce_loss": 0.04757703095674515, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "distill_loss": 0.09931950271129608, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "ref_ce_loss": 0.06264876574277878, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "loss": 0.37538662552833557, + "step": 17400 + }, + { + "ce_loss": 0.026338813826441765, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "distill_loss": 0.21343332529067993, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "ref_ce_loss": 0.06337901949882507, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "loss": 0.3582431972026825, + "step": 17400 + }, + { + "ce_loss": 0.11757601052522659, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "distill_loss": 0.14947518706321716, + "epoch": 5.803869246164109, + "step": 17400 + }, + { + "epoch": 5.803869246164109, + "ref_ce_loss": 0.09114629030227661, + "step": 17400 + }, + { + "epoch": 5.807204803202135, + "loss": 0.3419, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "grad_norm": 3.1050589084625244, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "learning_rate": 5.532724428578834e-05, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "loss": 0.2950383722782135, + "step": 17410 + }, + { + "ce_loss": 0.019280388951301575, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "distill_loss": 0.14772720634937286, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "ref_ce_loss": 0.08486374467611313, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "loss": 0.664941668510437, + "step": 17410 + }, + { + "ce_loss": 0.12758208811283112, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "distill_loss": 0.15463386476039886, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "ref_ce_loss": 0.09197648614645004, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "loss": 0.510972797870636, + "step": 17410 + }, + { + "ce_loss": 0.1270759105682373, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "distill_loss": 0.14177629351615906, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "ref_ce_loss": 0.09685118496417999, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "loss": 0.34812983870506287, + "step": 17410 + }, + { + "ce_loss": 0.12147245556116104, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "distill_loss": 0.1561581939458847, + "epoch": 5.807204803202135, + "step": 17410 + }, + { + "epoch": 5.807204803202135, + "ref_ce_loss": 0.07011862099170685, + "step": 17410 + }, + { + "epoch": 5.81054036024016, + "loss": 0.3592, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "grad_norm": 1.6063811779022217, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "learning_rate": 5.517021198069276e-05, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "loss": 0.292193204164505, + "step": 17420 + }, + { + "ce_loss": 0.05114936828613281, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "distill_loss": 0.13860230147838593, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "ref_ce_loss": 0.06149422004818916, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "loss": 0.3692050278186798, + "step": 17420 + }, + { + "ce_loss": 0.07337262481451035, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "distill_loss": 0.1356513649225235, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "ref_ce_loss": 0.06378420442342758, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "loss": 0.42367690801620483, + "step": 17420 + }, + { + "ce_loss": 0.0620979443192482, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "distill_loss": 0.1174180656671524, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "ref_ce_loss": 0.05651075392961502, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "loss": 0.3047388792037964, + "step": 17420 + }, + { + "ce_loss": 0.09635266661643982, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "distill_loss": 0.15084218978881836, + "epoch": 5.81054036024016, + "step": 17420 + }, + { + "epoch": 5.81054036024016, + "ref_ce_loss": 0.05727013573050499, + "step": 17420 + }, + { + "epoch": 5.813875917278185, + "loss": 0.3478, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "grad_norm": 2.7375216484069824, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "learning_rate": 5.501335260772329e-05, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "loss": 0.2118167281150818, + "step": 17430 + }, + { + "ce_loss": 0.04239267483353615, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "distill_loss": 0.10191035270690918, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "ref_ce_loss": 0.03936781361699104, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "loss": 0.49797767400741577, + "step": 17430 + }, + { + "ce_loss": 0.045204028487205505, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "distill_loss": 0.10383464395999908, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "ref_ce_loss": 0.05662389472126961, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "loss": 0.3084079623222351, + "step": 17430 + }, + { + "ce_loss": 0.07492610812187195, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "distill_loss": 0.11228092014789581, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "ref_ce_loss": 0.11168135702610016, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "loss": 0.4190921187400818, + "step": 17430 + }, + { + "ce_loss": 0.08887006342411041, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "distill_loss": 0.13845597207546234, + "epoch": 5.813875917278185, + "step": 17430 + }, + { + "epoch": 5.813875917278185, + "ref_ce_loss": 0.08437550812959671, + "step": 17430 + }, + { + "epoch": 5.817211474316211, + "loss": 0.3496, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "grad_norm": 3.1267213821411133, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "learning_rate": 5.4856666452929435e-05, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "loss": 0.35733237862586975, + "step": 17440 + }, + { + "ce_loss": 0.08284957706928253, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "distill_loss": 0.16209322214126587, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "ref_ce_loss": 0.07235975563526154, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "loss": 0.3992215096950531, + "step": 17440 + }, + { + "ce_loss": 0.09641101956367493, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "distill_loss": 0.1932550072669983, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "ref_ce_loss": 0.08727872371673584, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "loss": 0.23189151287078857, + "step": 17440 + }, + { + "ce_loss": 0.04314230754971504, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "distill_loss": 0.10661713778972626, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "ref_ce_loss": 0.05453113839030266, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "loss": 0.4198133945465088, + "step": 17440 + }, + { + "ce_loss": 0.07765809446573257, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "distill_loss": 0.23638515174388885, + "epoch": 5.817211474316211, + "step": 17440 + }, + { + "epoch": 5.817211474316211, + "ref_ce_loss": 0.08266009390354156, + "step": 17440 + }, + { + "epoch": 5.820547031354236, + "loss": 0.3357, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "grad_norm": 1.607082486152649, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "learning_rate": 5.470015380204498e-05, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "loss": 0.4796401858329773, + "step": 17450 + }, + { + "ce_loss": 0.1269017606973648, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "distill_loss": 0.15161138772964478, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "ref_ce_loss": 0.13180014491081238, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "loss": 0.3101658225059509, + "step": 17450 + }, + { + "ce_loss": 0.06727251410484314, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "distill_loss": 0.14738528430461884, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "ref_ce_loss": 0.062444526702165604, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "loss": 0.3178739845752716, + "step": 17450 + }, + { + "ce_loss": 0.06257525831460953, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "distill_loss": 0.0920637771487236, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "ref_ce_loss": 0.07175959646701813, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "loss": 0.30642449855804443, + "step": 17450 + }, + { + "ce_loss": 0.050568338483572006, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "distill_loss": 0.09929658472537994, + "epoch": 5.820547031354236, + "step": 17450 + }, + { + "epoch": 5.820547031354236, + "ref_ce_loss": 0.05786219611763954, + "step": 17450 + }, + { + "epoch": 5.823882588392261, + "loss": 0.3484, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "grad_norm": 2.263108730316162, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "learning_rate": 5.454381494048726e-05, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "loss": 0.24826695024967194, + "step": 17460 + }, + { + "ce_loss": 0.06059538573026657, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "distill_loss": 0.1367242932319641, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "ref_ce_loss": 0.0508265420794487, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "loss": 0.2540883719921112, + "step": 17460 + }, + { + "ce_loss": 0.02674366906285286, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "distill_loss": 0.08802241832017899, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "ref_ce_loss": 0.06723759323358536, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "loss": 0.3104031980037689, + "step": 17460 + }, + { + "ce_loss": 0.06915073096752167, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "distill_loss": 0.12898601591587067, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "ref_ce_loss": 0.05971602350473404, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "loss": 0.28427770733833313, + "step": 17460 + }, + { + "ce_loss": 0.037224024534225464, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "distill_loss": 0.11982965469360352, + "epoch": 5.823882588392261, + "step": 17460 + }, + { + "epoch": 5.823882588392261, + "ref_ce_loss": 0.07563572376966476, + "step": 17460 + }, + { + "epoch": 5.827218145430287, + "loss": 0.3525, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "grad_norm": 3.3239011764526367, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "learning_rate": 5.4387650153356715e-05, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "loss": 0.37470170855522156, + "step": 17470 + }, + { + "ce_loss": 0.08236923068761826, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "distill_loss": 0.17556804418563843, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "ref_ce_loss": 0.057811666280031204, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "loss": 0.25655317306518555, + "step": 17470 + }, + { + "ce_loss": 0.06295975297689438, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "distill_loss": 0.10989764332771301, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "ref_ce_loss": 0.0832274779677391, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "loss": 0.2870965600013733, + "step": 17470 + }, + { + "ce_loss": 0.04364733025431633, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "distill_loss": 0.1371404379606247, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "ref_ce_loss": 0.07653002440929413, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "loss": 0.30099189281463623, + "step": 17470 + }, + { + "ce_loss": 0.07090586423873901, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "distill_loss": 0.11548236757516861, + "epoch": 5.827218145430287, + "step": 17470 + }, + { + "epoch": 5.827218145430287, + "ref_ce_loss": 0.07585054636001587, + "step": 17470 + }, + { + "epoch": 5.830553702468312, + "loss": 0.3701, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "grad_norm": 3.6772897243499756, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "learning_rate": 5.423165972543634e-05, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "loss": 0.49872735142707825, + "step": 17480 + }, + { + "ce_loss": 0.11087694019079208, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "distill_loss": 0.1530749797821045, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "ref_ce_loss": 0.10035675019025803, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "loss": 0.6843752861022949, + "step": 17480 + }, + { + "ce_loss": 0.09181319177150726, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "distill_loss": 0.12344271689653397, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "ref_ce_loss": 0.10776031762361526, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "loss": 0.2865215241909027, + "step": 17480 + }, + { + "ce_loss": 0.037878841161727905, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "distill_loss": 0.12363813072443008, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "ref_ce_loss": 0.07112205773591995, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "loss": 0.3730589747428894, + "step": 17480 + }, + { + "ce_loss": 0.030367298051714897, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "distill_loss": 0.15017357468605042, + "epoch": 5.830553702468312, + "step": 17480 + }, + { + "epoch": 5.830553702468312, + "ref_ce_loss": 0.05540158227086067, + "step": 17480 + }, + { + "epoch": 5.833889259506337, + "loss": 0.3894, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "grad_norm": 2.250351905822754, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "learning_rate": 5.4075843941191046e-05, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "loss": 0.2717742919921875, + "step": 17490 + }, + { + "ce_loss": 0.02255961485207081, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "distill_loss": 0.17978093028068542, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "ref_ce_loss": 0.06858284026384354, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "loss": 0.3072040379047394, + "step": 17490 + }, + { + "ce_loss": 0.056123632937669754, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "distill_loss": 0.1385165899991989, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "ref_ce_loss": 0.0832224190235138, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "loss": 0.2829488515853882, + "step": 17490 + }, + { + "ce_loss": 0.06095289811491966, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "distill_loss": 0.10436280816793442, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "ref_ce_loss": 0.05648965388536453, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "loss": 0.42669305205345154, + "step": 17490 + }, + { + "ce_loss": 0.121896892786026, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "distill_loss": 0.19189319014549255, + "epoch": 5.833889259506337, + "step": 17490 + }, + { + "epoch": 5.833889259506337, + "ref_ce_loss": 0.08206455409526825, + "step": 17490 + }, + { + "epoch": 5.837224816544363, + "loss": 0.3718, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "grad_norm": 1.9109488725662231, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "learning_rate": 5.3920203084767406e-05, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "loss": 0.24839021265506744, + "step": 17500 + }, + { + "ce_loss": 0.02857845090329647, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "distill_loss": 0.11244728416204453, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "ref_ce_loss": 0.04522886872291565, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "loss": 0.39638009667396545, + "step": 17500 + }, + { + "ce_loss": 0.09089220315217972, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "distill_loss": 0.167978435754776, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "ref_ce_loss": 0.09620627760887146, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "loss": 0.6351642608642578, + "step": 17500 + }, + { + "ce_loss": 0.08827564865350723, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "distill_loss": 0.1794801503419876, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "ref_ce_loss": 0.08518512547016144, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "loss": 0.8153314590454102, + "step": 17500 + }, + { + "ce_loss": 0.12424998730421066, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "distill_loss": 0.14850200712680817, + "epoch": 5.837224816544363, + "step": 17500 + }, + { + "epoch": 5.837224816544363, + "ref_ce_loss": 0.06928473711013794, + "step": 17500 + }, + { + "epoch": 5.840560373582388, + "loss": 0.3703, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "grad_norm": 4.421696662902832, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "learning_rate": 5.3764737439992964e-05, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "loss": 0.40214866399765015, + "step": 17510 + }, + { + "ce_loss": 0.13289524614810944, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "distill_loss": 0.17023883759975433, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "ref_ce_loss": 0.05464457720518112, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "loss": 0.2773127257823944, + "step": 17510 + }, + { + "ce_loss": 0.04269995912909508, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "distill_loss": 0.10322048515081406, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "ref_ce_loss": 0.06018978729844093, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "loss": 0.4813617765903473, + "step": 17510 + }, + { + "ce_loss": 0.0832713320851326, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "distill_loss": 0.2649557590484619, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "ref_ce_loss": 0.09463027119636536, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "loss": 0.2821495234966278, + "step": 17510 + }, + { + "ce_loss": 0.04585276544094086, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "distill_loss": 0.1394762545824051, + "epoch": 5.840560373582388, + "step": 17510 + }, + { + "epoch": 5.840560373582388, + "ref_ce_loss": 0.06220794469118118, + "step": 17510 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.3736, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "grad_norm": 4.105420112609863, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "learning_rate": 5.360944729037572e-05, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.3745241165161133, + "step": 17520 + }, + { + "ce_loss": 0.0848168134689331, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "distill_loss": 0.1598915159702301, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "ref_ce_loss": 0.05103028565645218, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.7227115631103516, + "step": 17520 + }, + { + "ce_loss": 0.0658605769276619, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "distill_loss": 0.1462346315383911, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "ref_ce_loss": 0.06259340047836304, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.5101954936981201, + "step": 17520 + }, + { + "ce_loss": 0.02249034121632576, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "distill_loss": 0.09204475581645966, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "ref_ce_loss": 0.08358345180749893, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "loss": 0.25509950518608093, + "step": 17520 + }, + { + "ce_loss": 0.0710252895951271, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "distill_loss": 0.10144200921058655, + "epoch": 5.8438959306204135, + "step": 17520 + }, + { + "epoch": 5.8438959306204135, + "ref_ce_loss": 0.054177094250917435, + "step": 17520 + }, + { + "epoch": 5.847231487658439, + "loss": 0.3925, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "grad_norm": 3.8222227096557617, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "learning_rate": 5.345433291910368e-05, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "loss": 0.3156295418739319, + "step": 17530 + }, + { + "ce_loss": 0.04494838789105415, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "distill_loss": 0.1247267797589302, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "ref_ce_loss": 0.05149764195084572, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "loss": 0.2993130385875702, + "step": 17530 + }, + { + "ce_loss": 0.024480029940605164, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "distill_loss": 0.15972600877285004, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "ref_ce_loss": 0.0775722861289978, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "loss": 0.3961556553840637, + "step": 17530 + }, + { + "ce_loss": 0.08216272294521332, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "distill_loss": 0.20880329608917236, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "ref_ce_loss": 0.07281126081943512, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "loss": 0.37639302015304565, + "step": 17530 + }, + { + "ce_loss": 0.04775812849402428, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "distill_loss": 0.11027499288320541, + "epoch": 5.847231487658439, + "step": 17530 + }, + { + "epoch": 5.847231487658439, + "ref_ce_loss": 0.07947772741317749, + "step": 17530 + }, + { + "epoch": 5.850567044696464, + "loss": 0.3653, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "grad_norm": 2.4197661876678467, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "learning_rate": 5.3299394609044204e-05, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "loss": 0.3402426540851593, + "step": 17540 + }, + { + "ce_loss": 0.08751311153173447, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "distill_loss": 0.09833948314189911, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "ref_ce_loss": 0.08320891112089157, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "loss": 0.3248119652271271, + "step": 17540 + }, + { + "ce_loss": 0.05598754063248634, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "distill_loss": 0.13237228989601135, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "ref_ce_loss": 0.05231640860438347, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "loss": 0.21130509674549103, + "step": 17540 + }, + { + "ce_loss": 0.026743734255433083, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "distill_loss": 0.10511060059070587, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "ref_ce_loss": 0.05859754607081413, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "loss": 0.4116530418395996, + "step": 17540 + }, + { + "ce_loss": 0.08053731918334961, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "distill_loss": 0.21697655320167542, + "epoch": 5.850567044696464, + "step": 17540 + }, + { + "epoch": 5.850567044696464, + "ref_ce_loss": 0.08334068953990936, + "step": 17540 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.3875, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "grad_norm": 1.7867008447647095, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "learning_rate": 5.314463264274367e-05, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.32216352224349976, + "step": 17550 + }, + { + "ce_loss": 0.08698655664920807, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "distill_loss": 0.12306316196918488, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "ref_ce_loss": 0.07335870712995529, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.3635510802268982, + "step": 17550 + }, + { + "ce_loss": 0.06758643686771393, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "distill_loss": 0.19247770309448242, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "ref_ce_loss": 0.08190000057220459, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.4638984799385071, + "step": 17550 + }, + { + "ce_loss": 0.09081947803497314, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "distill_loss": 0.175666943192482, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "ref_ce_loss": 0.09541428089141846, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "loss": 0.3402867913246155, + "step": 17550 + }, + { + "ce_loss": 0.08957655727863312, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "distill_loss": 0.1153869479894638, + "epoch": 5.8539026017344895, + "step": 17550 + }, + { + "epoch": 5.8539026017344895, + "ref_ce_loss": 0.09455670416355133, + "step": 17550 + }, + { + "epoch": 5.857238158772515, + "loss": 0.3689, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "grad_norm": 2.4782440662384033, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "learning_rate": 5.2990047302426894e-05, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "loss": 0.25175487995147705, + "step": 17560 + }, + { + "ce_loss": 0.018894441425800323, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "distill_loss": 0.110230453312397, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "ref_ce_loss": 0.04608863964676857, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "loss": 0.4973832666873932, + "step": 17560 + }, + { + "ce_loss": 0.09207753837108612, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "distill_loss": 0.14106489717960358, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "ref_ce_loss": 0.05099315196275711, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "loss": 0.27829593420028687, + "step": 17560 + }, + { + "ce_loss": 0.0815405324101448, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "distill_loss": 0.11980370432138443, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "ref_ce_loss": 0.053519003093242645, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "loss": 0.5003367066383362, + "step": 17560 + }, + { + "ce_loss": 0.1114412397146225, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "distill_loss": 0.17173542082309723, + "epoch": 5.857238158772515, + "step": 17560 + }, + { + "epoch": 5.857238158772515, + "ref_ce_loss": 0.09367650002241135, + "step": 17560 + }, + { + "epoch": 5.86057371581054, + "loss": 0.3489, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "grad_norm": 2.932281494140625, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "learning_rate": 5.283563886999651e-05, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "loss": 0.41849589347839355, + "step": 17570 + }, + { + "ce_loss": 0.07301030308008194, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "distill_loss": 0.14979524910449982, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "ref_ce_loss": 0.08146942406892776, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "loss": 0.2620297074317932, + "step": 17570 + }, + { + "ce_loss": 0.060599468648433685, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "distill_loss": 0.09925365447998047, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "ref_ce_loss": 0.06362560391426086, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "loss": 0.4664449691772461, + "step": 17570 + }, + { + "ce_loss": 0.04816756024956703, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "distill_loss": 0.14493735134601593, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "ref_ce_loss": 0.09040951728820801, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "loss": 0.23289237916469574, + "step": 17570 + }, + { + "ce_loss": 0.015162667259573936, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "distill_loss": 0.11945579946041107, + "epoch": 5.86057371581054, + "step": 17570 + }, + { + "epoch": 5.86057371581054, + "ref_ce_loss": 0.05992235243320465, + "step": 17570 + }, + { + "epoch": 5.863909272848566, + "loss": 0.3386, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "grad_norm": 1.6773673295974731, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "learning_rate": 5.268140762703269e-05, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "loss": 0.3034585118293762, + "step": 17580 + }, + { + "ce_loss": 0.04708658531308174, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "distill_loss": 0.12628522515296936, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "ref_ce_loss": 0.08748694509267807, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "loss": 0.3233617842197418, + "step": 17580 + }, + { + "ce_loss": 0.10379615426063538, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "distill_loss": 0.10835333913564682, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "ref_ce_loss": 0.08786173909902573, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "loss": 0.3202160596847534, + "step": 17580 + }, + { + "ce_loss": 0.042189840227365494, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "distill_loss": 0.15545186400413513, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "ref_ce_loss": 0.08296966552734375, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "loss": 0.49785640835762024, + "step": 17580 + }, + { + "ce_loss": 0.09063033759593964, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "distill_loss": 0.15018562972545624, + "epoch": 5.863909272848566, + "step": 17580 + }, + { + "epoch": 5.863909272848566, + "ref_ce_loss": 0.08623871207237244, + "step": 17580 + }, + { + "epoch": 5.867244829886591, + "loss": 0.3401, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "grad_norm": 4.520524501800537, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "learning_rate": 5.2527353854792236e-05, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "loss": 0.43522706627845764, + "step": 17590 + }, + { + "ce_loss": 0.0914255827665329, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "distill_loss": 0.191763773560524, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "ref_ce_loss": 0.07178188115358353, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "loss": 0.31930312514305115, + "step": 17590 + }, + { + "ce_loss": 0.062497545033693314, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "distill_loss": 0.186318039894104, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "ref_ce_loss": 0.05268324166536331, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "loss": 0.386909157037735, + "step": 17590 + }, + { + "ce_loss": 0.11041705310344696, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "distill_loss": 0.1834145039319992, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "ref_ce_loss": 0.09284200519323349, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "loss": 0.2399304360151291, + "step": 17590 + }, + { + "ce_loss": 0.035906922072172165, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "distill_loss": 0.08376122266054153, + "epoch": 5.867244829886591, + "step": 17590 + }, + { + "epoch": 5.867244829886591, + "ref_ce_loss": 0.03996507450938225, + "step": 17590 + }, + { + "epoch": 5.870580386924616, + "loss": 0.3953, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "grad_norm": 3.3450121879577637, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "learning_rate": 5.237347783420854e-05, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "loss": 0.6455330848693848, + "step": 17600 + }, + { + "ce_loss": 0.11936826258897781, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "distill_loss": 0.1688607782125473, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "ref_ce_loss": 0.08025792241096497, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "loss": 0.41483816504478455, + "step": 17600 + }, + { + "ce_loss": 0.072411447763443, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "distill_loss": 0.16342517733573914, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "ref_ce_loss": 0.03735579922795296, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "loss": 0.44591468572616577, + "step": 17600 + }, + { + "ce_loss": 0.036500539630651474, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "distill_loss": 0.10168357193470001, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "ref_ce_loss": 0.06447294354438782, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "loss": 0.4203828275203705, + "step": 17600 + }, + { + "ce_loss": 0.12469840794801712, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "distill_loss": 0.18332841992378235, + "epoch": 5.870580386924616, + "step": 17600 + }, + { + "epoch": 5.870580386924616, + "ref_ce_loss": 0.06972688436508179, + "step": 17600 + }, + { + "epoch": 5.873915943962642, + "loss": 0.3433, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "grad_norm": 2.724898099899292, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "learning_rate": 5.221977984589075e-05, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "loss": 0.5014619827270508, + "step": 17610 + }, + { + "ce_loss": 0.14343143999576569, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "distill_loss": 0.16040818393230438, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "ref_ce_loss": 0.09495589882135391, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "loss": 0.2154809683561325, + "step": 17610 + }, + { + "ce_loss": 0.06345026940107346, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "distill_loss": 0.10396941751241684, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "ref_ce_loss": 0.047588132321834564, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "loss": 0.28772875666618347, + "step": 17610 + }, + { + "ce_loss": 0.06648959964513779, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "distill_loss": 0.1429111659526825, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "ref_ce_loss": 0.05615722015500069, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "loss": 0.19824743270874023, + "step": 17610 + }, + { + "ce_loss": 0.057200632989406586, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "distill_loss": 0.10092482715845108, + "epoch": 5.873915943962642, + "step": 17610 + }, + { + "epoch": 5.873915943962642, + "ref_ce_loss": 0.04004830867052078, + "step": 17610 + }, + { + "epoch": 5.877251501000667, + "loss": 0.3221, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "grad_norm": 6.1942949295043945, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "learning_rate": 5.206626017012337e-05, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "loss": 0.5739012360572815, + "step": 17620 + }, + { + "ce_loss": 0.2185136377811432, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "distill_loss": 0.15722893178462982, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "ref_ce_loss": 0.11173129081726074, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "loss": 0.3570323586463928, + "step": 17620 + }, + { + "ce_loss": 0.07485233247280121, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "distill_loss": 0.12562787532806396, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "ref_ce_loss": 0.08965486288070679, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "loss": 0.5900654196739197, + "step": 17620 + }, + { + "ce_loss": 0.03555797412991524, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "distill_loss": 0.10250119864940643, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "ref_ce_loss": 0.05633779987692833, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "loss": 0.23747488856315613, + "step": 17620 + }, + { + "ce_loss": 0.024216409772634506, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "distill_loss": 0.11333998292684555, + "epoch": 5.877251501000667, + "step": 17620 + }, + { + "epoch": 5.877251501000667, + "ref_ce_loss": 0.05125695839524269, + "step": 17620 + }, + { + "epoch": 5.880587058038692, + "loss": 0.3655, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "grad_norm": 2.7308340072631836, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "learning_rate": 5.1912919086865784e-05, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "loss": 0.32668551802635193, + "step": 17630 + }, + { + "ce_loss": 0.07541286200284958, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "distill_loss": 0.14657814800739288, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "ref_ce_loss": 0.07333363592624664, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "loss": 0.2846378684043884, + "step": 17630 + }, + { + "ce_loss": 0.04143878072500229, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "distill_loss": 0.13445129990577698, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "ref_ce_loss": 0.07378637790679932, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "loss": 0.24098753929138184, + "step": 17630 + }, + { + "ce_loss": 0.029380042105913162, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "distill_loss": 0.118621826171875, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "ref_ce_loss": 0.06290895491838455, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "loss": 0.3457111418247223, + "step": 17630 + }, + { + "ce_loss": 0.05599174648523331, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "distill_loss": 0.13150471448898315, + "epoch": 5.880587058038692, + "step": 17630 + }, + { + "epoch": 5.880587058038692, + "ref_ce_loss": 0.08102560043334961, + "step": 17630 + }, + { + "epoch": 5.883922615076718, + "loss": 0.3634, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "grad_norm": 2.354527711868286, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "learning_rate": 5.1759756875751543e-05, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "loss": 0.4567744731903076, + "step": 17640 + }, + { + "ce_loss": 0.06487561762332916, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "distill_loss": 0.14871536195278168, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "ref_ce_loss": 0.1132284477353096, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "loss": 0.3961792290210724, + "step": 17640 + }, + { + "ce_loss": 0.11353077739477158, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "distill_loss": 0.12885455787181854, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "ref_ce_loss": 0.09726857393980026, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "loss": 0.32877179980278015, + "step": 17640 + }, + { + "ce_loss": 0.0983106791973114, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "distill_loss": 0.15973711013793945, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "ref_ce_loss": 0.07053600996732712, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "loss": 0.19096627831459045, + "step": 17640 + }, + { + "ce_loss": 0.03290770947933197, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "distill_loss": 0.11705254018306732, + "epoch": 5.883922615076718, + "step": 17640 + }, + { + "epoch": 5.883922615076718, + "ref_ce_loss": 0.04061080887913704, + "step": 17640 + }, + { + "epoch": 5.887258172114743, + "loss": 0.3413, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "grad_norm": 3.297376871109009, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "learning_rate": 5.160677381608814e-05, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "loss": 0.3749341070652008, + "step": 17650 + }, + { + "ce_loss": 0.05691100284457207, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "distill_loss": 0.12351744621992111, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "ref_ce_loss": 0.055408477783203125, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "loss": 0.43304702639579773, + "step": 17650 + }, + { + "ce_loss": 0.07342436164617538, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "distill_loss": 0.11726836860179901, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "ref_ce_loss": 0.08709575235843658, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "loss": 0.3348458409309387, + "step": 17650 + }, + { + "ce_loss": 0.019180668517947197, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "distill_loss": 0.1126505583524704, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "ref_ce_loss": 0.030465707182884216, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "loss": 0.6081949472427368, + "step": 17650 + }, + { + "ce_loss": 0.104547418653965, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "distill_loss": 0.17620466649532318, + "epoch": 5.887258172114743, + "step": 17650 + }, + { + "epoch": 5.887258172114743, + "ref_ce_loss": 0.11676531285047531, + "step": 17650 + }, + { + "epoch": 5.890593729152768, + "loss": 0.3933, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "grad_norm": 3.335609197616577, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "learning_rate": 5.14539701868564e-05, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "loss": 0.4062153100967407, + "step": 17660 + }, + { + "ce_loss": 0.07004745304584503, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "distill_loss": 0.16154888272285461, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "ref_ce_loss": 0.07381752133369446, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "loss": 0.5447138547897339, + "step": 17660 + }, + { + "ce_loss": 0.10842008143663406, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "distill_loss": 0.3149142861366272, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "ref_ce_loss": 0.12105908244848251, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "loss": 0.37604960799217224, + "step": 17660 + }, + { + "ce_loss": 0.07257735729217529, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "distill_loss": 0.1173940971493721, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "ref_ce_loss": 0.09473676234483719, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "loss": 0.3164910078048706, + "step": 17660 + }, + { + "ce_loss": 0.03366474434733391, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "distill_loss": 0.1109549030661583, + "epoch": 5.890593729152768, + "step": 17660 + }, + { + "epoch": 5.890593729152768, + "ref_ce_loss": 0.08360662311315536, + "step": 17660 + }, + { + "epoch": 5.893929286190794, + "loss": 0.3591, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "grad_norm": 2.3944671154022217, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "learning_rate": 5.1301346266709684e-05, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "loss": 0.21314899623394012, + "step": 17670 + }, + { + "ce_loss": 0.03496822342276573, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "distill_loss": 0.11594592034816742, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "ref_ce_loss": 0.047304894775152206, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "loss": 0.467916876077652, + "step": 17670 + }, + { + "ce_loss": 0.15920695662498474, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "distill_loss": 0.15867634117603302, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "ref_ce_loss": 0.07394873350858688, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "loss": 0.19804324209690094, + "step": 17670 + }, + { + "ce_loss": 0.018038207665085793, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "distill_loss": 0.09465809911489487, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "ref_ce_loss": 0.05995374172925949, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "loss": 0.4456608295440674, + "step": 17670 + }, + { + "ce_loss": 0.13386403024196625, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "distill_loss": 0.17635229229927063, + "epoch": 5.893929286190794, + "step": 17670 + }, + { + "epoch": 5.893929286190794, + "ref_ce_loss": 0.07214650511741638, + "step": 17670 + }, + { + "epoch": 5.897264843228819, + "loss": 0.3387, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "grad_norm": 3.302736759185791, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "learning_rate": 5.114890233397405e-05, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "loss": 0.3278179168701172, + "step": 17680 + }, + { + "ce_loss": 0.06378965824842453, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "distill_loss": 0.10937260091304779, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "ref_ce_loss": 0.08671337366104126, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "loss": 1.0311903953552246, + "step": 17680 + }, + { + "ce_loss": 0.1758325695991516, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "distill_loss": 0.17192047834396362, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "ref_ce_loss": 0.07445625215768814, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "loss": 0.46555769443511963, + "step": 17680 + }, + { + "ce_loss": 0.0498911514878273, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "distill_loss": 0.1123102605342865, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "ref_ce_loss": 0.047028716653585434, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "loss": 0.6143039464950562, + "step": 17680 + }, + { + "ce_loss": 0.12528234720230103, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "distill_loss": 0.15658093988895416, + "epoch": 5.897264843228819, + "step": 17680 + }, + { + "epoch": 5.897264843228819, + "ref_ce_loss": 0.0682942271232605, + "step": 17680 + }, + { + "epoch": 5.900600400266844, + "loss": 0.3652, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "grad_norm": 2.636820077896118, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "learning_rate": 5.0996638666646916e-05, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "loss": 0.4062114357948303, + "step": 17690 + }, + { + "ce_loss": 0.05009109526872635, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "distill_loss": 0.15433265268802643, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "ref_ce_loss": 0.09808304905891418, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "loss": 0.6093151569366455, + "step": 17690 + }, + { + "ce_loss": 0.09959680587053299, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "distill_loss": 0.1387074738740921, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "ref_ce_loss": 0.05216062441468239, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "loss": 0.3263792395591736, + "step": 17690 + }, + { + "ce_loss": 0.09031203389167786, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "distill_loss": 0.13475021719932556, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "ref_ce_loss": 0.06634215265512466, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "loss": 0.35873445868492126, + "step": 17690 + }, + { + "ce_loss": 0.069797083735466, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "distill_loss": 0.15537424385547638, + "epoch": 5.900600400266844, + "step": 17690 + }, + { + "epoch": 5.900600400266844, + "ref_ce_loss": 0.11273673176765442, + "step": 17690 + }, + { + "epoch": 5.90393595730487, + "loss": 0.3705, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "grad_norm": 1.9607138633728027, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "learning_rate": 5.084455554239724e-05, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "loss": 0.2984341084957123, + "step": 17700 + }, + { + "ce_loss": 0.04308363050222397, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "distill_loss": 0.1266622692346573, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "ref_ce_loss": 0.06170135736465454, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "loss": 0.2519134283065796, + "step": 17700 + }, + { + "ce_loss": 0.03667460009455681, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "distill_loss": 0.12148652225732803, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "ref_ce_loss": 0.05401930958032608, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "loss": 0.42079973220825195, + "step": 17700 + }, + { + "ce_loss": 0.07386600226163864, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "distill_loss": 0.20743930339813232, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "ref_ce_loss": 0.10644058138132095, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "loss": 0.5361905694007874, + "step": 17700 + }, + { + "ce_loss": 0.08765469491481781, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "distill_loss": 0.13080808520317078, + "epoch": 5.90393595730487, + "step": 17700 + }, + { + "epoch": 5.90393595730487, + "ref_ce_loss": 0.06986983120441437, + "step": 17700 + }, + { + "epoch": 5.907271514342895, + "loss": 0.3395, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "grad_norm": 2.605874538421631, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "learning_rate": 5.069265323856464e-05, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "loss": 0.3166840672492981, + "step": 17710 + }, + { + "ce_loss": 0.07511896640062332, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "distill_loss": 0.11693719774484634, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "ref_ce_loss": 0.04918194189667702, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "loss": 0.2965814471244812, + "step": 17710 + }, + { + "ce_loss": 0.029125245288014412, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "distill_loss": 0.1444043070077896, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "ref_ce_loss": 0.05168358236551285, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "loss": 0.3079814612865448, + "step": 17710 + }, + { + "ce_loss": 0.07331456243991852, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "distill_loss": 0.16929419338703156, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "ref_ce_loss": 0.0648675486445427, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "loss": 0.2419438362121582, + "step": 17710 + }, + { + "ce_loss": 0.032821644097566605, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "distill_loss": 0.16019362211227417, + "epoch": 5.907271514342895, + "step": 17710 + }, + { + "epoch": 5.907271514342895, + "ref_ce_loss": 0.04862841218709946, + "step": 17710 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.347, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "grad_norm": 2.5357227325439453, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "learning_rate": 5.054093203215896e-05, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.40148186683654785, + "step": 17720 + }, + { + "ce_loss": 0.11128007620573044, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "distill_loss": 0.18451912701129913, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "ref_ce_loss": 0.10536003857851028, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.21454600989818573, + "step": 17720 + }, + { + "ce_loss": 0.060639139264822006, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "distill_loss": 0.08433546870946884, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "ref_ce_loss": 0.05175725743174553, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.42682793736457825, + "step": 17720 + }, + { + "ce_loss": 0.08756023645401001, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "distill_loss": 0.17866460978984833, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "ref_ce_loss": 0.090923972427845, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "loss": 0.23504258692264557, + "step": 17720 + }, + { + "ce_loss": 0.03457394242286682, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "distill_loss": 0.10484711080789566, + "epoch": 5.9106070713809205, + "step": 17720 + }, + { + "epoch": 5.9106070713809205, + "ref_ce_loss": 0.061616200953722, + "step": 17720 + }, + { + "epoch": 5.913942628418946, + "loss": 0.3788, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "grad_norm": 2.757432460784912, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "learning_rate": 5.038939219985979e-05, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "loss": 0.7194844484329224, + "step": 17730 + }, + { + "ce_loss": 0.11522924154996872, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "distill_loss": 0.15983763337135315, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "ref_ce_loss": 0.059831805527210236, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "loss": 0.3257623314857483, + "step": 17730 + }, + { + "ce_loss": 0.0717523992061615, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "distill_loss": 0.10491523146629333, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "ref_ce_loss": 0.08156079798936844, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "loss": 0.331621378660202, + "step": 17730 + }, + { + "ce_loss": 0.0417015366256237, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "distill_loss": 0.1441517323255539, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "ref_ce_loss": 0.06556253135204315, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "loss": 0.24882987141609192, + "step": 17730 + }, + { + "ce_loss": 0.04481315240263939, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "distill_loss": 0.12070439755916595, + "epoch": 5.913942628418946, + "step": 17730 + }, + { + "epoch": 5.913942628418946, + "ref_ce_loss": 0.04913991317152977, + "step": 17730 + }, + { + "epoch": 5.917278185456971, + "loss": 0.3517, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "grad_norm": 2.0407073497772217, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "learning_rate": 5.023803401801618e-05, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "loss": 0.3508586287498474, + "step": 17740 + }, + { + "ce_loss": 0.04194953292608261, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "distill_loss": 0.12034259736537933, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "ref_ce_loss": 0.06494162231683731, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "loss": 0.22291912138462067, + "step": 17740 + }, + { + "ce_loss": 0.0345107764005661, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "distill_loss": 0.09842812269926071, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "ref_ce_loss": 0.049962058663368225, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "loss": 0.39541149139404297, + "step": 17740 + }, + { + "ce_loss": 0.08839607238769531, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "distill_loss": 0.15726499259471893, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "ref_ce_loss": 0.08510472625494003, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "loss": 0.2958389222621918, + "step": 17740 + }, + { + "ce_loss": 0.07011163234710693, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "distill_loss": 0.13538995385169983, + "epoch": 5.917278185456971, + "step": 17740 + }, + { + "epoch": 5.917278185456971, + "ref_ce_loss": 0.0671728178858757, + "step": 17740 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.3611, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "grad_norm": 3.440727949142456, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "learning_rate": 5.0086857762645574e-05, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.2400866150856018, + "step": 17750 + }, + { + "ce_loss": 0.04839707911014557, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "distill_loss": 0.12277629971504211, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "ref_ce_loss": 0.03812626749277115, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.33886978030204773, + "step": 17750 + }, + { + "ce_loss": 0.053880248218774796, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "distill_loss": 0.19124048948287964, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "ref_ce_loss": 0.06993494182825089, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.4030514359474182, + "step": 17750 + }, + { + "ce_loss": 0.0309885386377573, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "distill_loss": 0.12448279559612274, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "ref_ce_loss": 0.06645922362804413, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "loss": 0.3506067395210266, + "step": 17750 + }, + { + "ce_loss": 0.09792735427618027, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "distill_loss": 0.15322014689445496, + "epoch": 5.9206137424949965, + "step": 17750 + }, + { + "epoch": 5.9206137424949965, + "ref_ce_loss": 0.05199942737817764, + "step": 17750 + }, + { + "epoch": 5.923949299533022, + "loss": 0.3787, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "grad_norm": 1.9704997539520264, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "learning_rate": 4.9935863709433945e-05, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "loss": 0.33312490582466125, + "step": 17760 + }, + { + "ce_loss": 0.09216049313545227, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "distill_loss": 0.11537503451108932, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "ref_ce_loss": 0.06690085679292679, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "loss": 0.6168229579925537, + "step": 17760 + }, + { + "ce_loss": 0.1484738290309906, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "distill_loss": 0.18674319982528687, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "ref_ce_loss": 0.13398297131061554, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "loss": 0.4197208285331726, + "step": 17760 + }, + { + "ce_loss": 0.08689359575510025, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "distill_loss": 0.15195444226264954, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "ref_ce_loss": 0.045860983431339264, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "loss": 0.622891902923584, + "step": 17760 + }, + { + "ce_loss": 0.13707229495048523, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "distill_loss": 0.2516328692436218, + "epoch": 5.923949299533022, + "step": 17760 + }, + { + "epoch": 5.923949299533022, + "ref_ce_loss": 0.09355565160512924, + "step": 17760 + }, + { + "epoch": 5.927284856571047, + "loss": 0.3647, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "grad_norm": 3.1013424396514893, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "learning_rate": 4.978505213373479e-05, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "loss": 0.25511544942855835, + "step": 17770 + }, + { + "ce_loss": 0.04221971705555916, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "distill_loss": 0.10919360816478729, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "ref_ce_loss": 0.07045939564704895, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "loss": 0.39829209446907043, + "step": 17770 + }, + { + "ce_loss": 0.13580723106861115, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "distill_loss": 0.16190184652805328, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "ref_ce_loss": 0.08752710372209549, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "loss": 0.49762338399887085, + "step": 17770 + }, + { + "ce_loss": 0.11486893892288208, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "distill_loss": 0.1458262801170349, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "ref_ce_loss": 0.08525654673576355, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "loss": 0.34713953733444214, + "step": 17770 + }, + { + "ce_loss": 0.13515137135982513, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "distill_loss": 0.13119323551654816, + "epoch": 5.927284856571047, + "step": 17770 + }, + { + "epoch": 5.927284856571047, + "ref_ce_loss": 0.06520017981529236, + "step": 17770 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.3589, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "grad_norm": 4.23400354385376, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "learning_rate": 4.9634423310568963e-05, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.4135797321796417, + "step": 17780 + }, + { + "ce_loss": 0.04827263206243515, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "distill_loss": 0.16114744544029236, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "ref_ce_loss": 0.09109925478696823, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.7013535499572754, + "step": 17780 + }, + { + "ce_loss": 0.04648328199982643, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "distill_loss": 0.1734389215707779, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "ref_ce_loss": 0.12075760960578918, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.32692864537239075, + "step": 17780 + }, + { + "ce_loss": 0.08308849483728409, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "distill_loss": 0.1307058483362198, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "ref_ce_loss": 0.08304446935653687, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "loss": 0.8171455264091492, + "step": 17780 + }, + { + "ce_loss": 0.1697182059288025, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "distill_loss": 0.2494955211877823, + "epoch": 5.9306204136090725, + "step": 17780 + }, + { + "epoch": 5.9306204136090725, + "ref_ce_loss": 0.14302107691764832, + "step": 17780 + }, + { + "epoch": 5.933955970647098, + "loss": 0.4004, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "grad_norm": 3.5227556228637695, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "learning_rate": 4.948397751462402e-05, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "loss": 0.2799118161201477, + "step": 17790 + }, + { + "ce_loss": 0.02078850008547306, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "distill_loss": 0.11289118975400925, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "ref_ce_loss": 0.07155484706163406, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "loss": 0.3532387614250183, + "step": 17790 + }, + { + "ce_loss": 0.06364618241786957, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "distill_loss": 0.13902947306632996, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "ref_ce_loss": 0.07811892777681351, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "loss": 0.40037864446640015, + "step": 17790 + }, + { + "ce_loss": 0.13156820833683014, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "distill_loss": 0.16982513666152954, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "ref_ce_loss": 0.07489024102687836, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "loss": 0.32981160283088684, + "step": 17790 + }, + { + "ce_loss": 0.08601900935173035, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "distill_loss": 0.15488427877426147, + "epoch": 5.933955970647098, + "step": 17790 + }, + { + "epoch": 5.933955970647098, + "ref_ce_loss": 0.08870402723550797, + "step": 17790 + }, + { + "epoch": 5.937291527685123, + "loss": 0.3339, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "grad_norm": 2.76710844039917, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "learning_rate": 4.933371502025377e-05, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "loss": 0.19565752148628235, + "step": 17800 + }, + { + "ce_loss": 0.032000426203012466, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "distill_loss": 0.08187977969646454, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "ref_ce_loss": 0.04776681214570999, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "loss": 0.3080092668533325, + "step": 17800 + }, + { + "ce_loss": 0.05300501361489296, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "distill_loss": 0.11545910686254501, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "ref_ce_loss": 0.08720026910305023, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "loss": 0.9395288228988647, + "step": 17800 + }, + { + "ce_loss": 0.09806068241596222, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "distill_loss": 0.1455065906047821, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "ref_ce_loss": 0.098774254322052, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "loss": 0.3972787857055664, + "step": 17800 + }, + { + "ce_loss": 0.08753319829702377, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "distill_loss": 0.1433761566877365, + "epoch": 5.937291527685123, + "step": 17800 + }, + { + "epoch": 5.937291527685123, + "ref_ce_loss": 0.07781031727790833, + "step": 17800 + }, + { + "epoch": 5.940627084723149, + "loss": 0.3778, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "grad_norm": 2.1027743816375732, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "learning_rate": 4.918363610147775e-05, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "loss": 0.2537614703178406, + "step": 17810 + }, + { + "ce_loss": 0.08498618751764297, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "distill_loss": 0.10000818222761154, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "ref_ce_loss": 0.044208452105522156, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "loss": 0.34233108162879944, + "step": 17810 + }, + { + "ce_loss": 0.10245108604431152, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "distill_loss": 0.1445772349834442, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "ref_ce_loss": 0.09517716616392136, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "loss": 0.6560655236244202, + "step": 17810 + }, + { + "ce_loss": 0.07113312929868698, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "distill_loss": 0.13794416189193726, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "ref_ce_loss": 0.036306753754615784, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "loss": 0.7157806158065796, + "step": 17810 + }, + { + "ce_loss": 0.0921962708234787, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "distill_loss": 0.17530521750450134, + "epoch": 5.940627084723149, + "step": 17810 + }, + { + "epoch": 5.940627084723149, + "ref_ce_loss": 0.10351521521806717, + "step": 17810 + }, + { + "epoch": 5.943962641761174, + "loss": 0.4083, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "grad_norm": 3.1235644817352295, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "learning_rate": 4.903374103198064e-05, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "loss": 0.342007040977478, + "step": 17820 + }, + { + "ce_loss": 0.05079389363527298, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "distill_loss": 0.12523949146270752, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "ref_ce_loss": 0.06961704790592194, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "loss": 0.36865371465682983, + "step": 17820 + }, + { + "ce_loss": 0.03996824845671654, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "distill_loss": 0.21217110753059387, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "ref_ce_loss": 0.08097124099731445, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "loss": 0.33703479170799255, + "step": 17820 + }, + { + "ce_loss": 0.04888799786567688, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "distill_loss": 0.1261524111032486, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "ref_ce_loss": 0.05606861412525177, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "loss": 0.28407689929008484, + "step": 17820 + }, + { + "ce_loss": 0.03848010301589966, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "distill_loss": 0.11298099160194397, + "epoch": 5.943962641761174, + "step": 17820 + }, + { + "epoch": 5.943962641761174, + "ref_ce_loss": 0.06575914472341537, + "step": 17820 + }, + { + "epoch": 5.947298198799199, + "loss": 0.3712, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "grad_norm": 2.5720245838165283, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "learning_rate": 4.8884030085111934e-05, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "loss": 0.2565248906612396, + "step": 17830 + }, + { + "ce_loss": 0.05670475959777832, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "distill_loss": 0.1039658635854721, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "ref_ce_loss": 0.06518025696277618, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "loss": 0.32152819633483887, + "step": 17830 + }, + { + "ce_loss": 0.03222854435443878, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "distill_loss": 0.13430121541023254, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "ref_ce_loss": 0.08920446783304214, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "loss": 0.23141776025295258, + "step": 17830 + }, + { + "ce_loss": 0.07320418953895569, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "distill_loss": 0.104472815990448, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "ref_ce_loss": 0.05357253551483154, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "loss": 0.2036972939968109, + "step": 17830 + }, + { + "ce_loss": 0.02201797068119049, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "distill_loss": 0.11067456752061844, + "epoch": 5.947298198799199, + "step": 17830 + }, + { + "epoch": 5.947298198799199, + "ref_ce_loss": 0.05038437992334366, + "step": 17830 + }, + { + "epoch": 5.950633755837225, + "loss": 0.3526, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "grad_norm": 2.784125804901123, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "learning_rate": 4.8734503533885355e-05, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "loss": 0.193734809756279, + "step": 17840 + }, + { + "ce_loss": 0.012308568693697453, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "distill_loss": 0.09944108128547668, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "ref_ce_loss": 0.03873451426625252, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "loss": 0.548555314540863, + "step": 17840 + }, + { + "ce_loss": 0.07860098034143448, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "distill_loss": 0.12121152877807617, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "ref_ce_loss": 0.06695293635129929, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "loss": 0.31232643127441406, + "step": 17840 + }, + { + "ce_loss": 0.02223125286400318, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "distill_loss": 0.11335283517837524, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "ref_ce_loss": 0.054550062865018845, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "loss": 0.3241339325904846, + "step": 17840 + }, + { + "ce_loss": 0.08912857621908188, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "distill_loss": 0.11383871734142303, + "epoch": 5.950633755837225, + "step": 17840 + }, + { + "epoch": 5.950633755837225, + "ref_ce_loss": 0.06944772601127625, + "step": 17840 + }, + { + "epoch": 5.95396931287525, + "loss": 0.3676, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "grad_norm": 2.149252414703369, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "learning_rate": 4.858516165097836e-05, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "loss": 0.47968047857284546, + "step": 17850 + }, + { + "ce_loss": 0.12607181072235107, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "distill_loss": 0.13286791741847992, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "ref_ce_loss": 0.05671996250748634, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "loss": 0.3840937316417694, + "step": 17850 + }, + { + "ce_loss": 0.10201926529407501, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "distill_loss": 0.14407148957252502, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "ref_ce_loss": 0.08624617010354996, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "loss": 0.43116295337677, + "step": 17850 + }, + { + "ce_loss": 0.09212303906679153, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "distill_loss": 0.11729922145605087, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "ref_ce_loss": 0.08468250930309296, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "loss": 0.47167813777923584, + "step": 17850 + }, + { + "ce_loss": 0.15872035920619965, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "distill_loss": 0.19481000304222107, + "epoch": 5.95396931287525, + "step": 17850 + }, + { + "epoch": 5.95396931287525, + "ref_ce_loss": 0.09276537597179413, + "step": 17850 + }, + { + "epoch": 5.957304869913275, + "loss": 0.391, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "grad_norm": 2.8126001358032227, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "learning_rate": 4.8436004708731636e-05, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "loss": 0.1991313099861145, + "step": 17860 + }, + { + "ce_loss": 0.042863406240940094, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "distill_loss": 0.09574732929468155, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "ref_ce_loss": 0.046636320650577545, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "loss": 0.34296974539756775, + "step": 17860 + }, + { + "ce_loss": 0.08897802978754044, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "distill_loss": 0.13395370543003082, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "ref_ce_loss": 0.0635993480682373, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "loss": 0.47116437554359436, + "step": 17860 + }, + { + "ce_loss": 0.12424060702323914, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "distill_loss": 0.2328859567642212, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "ref_ce_loss": 0.11392879486083984, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "loss": 0.37671905755996704, + "step": 17860 + }, + { + "ce_loss": 0.12620703876018524, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "distill_loss": 0.15510450303554535, + "epoch": 5.957304869913275, + "step": 17860 + }, + { + "epoch": 5.957304869913275, + "ref_ce_loss": 0.0951317846775055, + "step": 17860 + }, + { + "epoch": 5.960640426951301, + "loss": 0.3679, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "grad_norm": 2.2110021114349365, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "learning_rate": 4.8287032979148635e-05, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "loss": 0.4523363411426544, + "step": 17870 + }, + { + "ce_loss": 0.11198491603136063, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "distill_loss": 0.15241265296936035, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "ref_ce_loss": 0.07542113214731216, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "loss": 0.48284369707107544, + "step": 17870 + }, + { + "ce_loss": 0.12855391204357147, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "distill_loss": 0.21619918942451477, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "ref_ce_loss": 0.07777365297079086, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "loss": 0.3478761613368988, + "step": 17870 + }, + { + "ce_loss": 0.0799589678645134, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "distill_loss": 0.09983595460653305, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "ref_ce_loss": 0.07422303408384323, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "loss": 0.36284440755844116, + "step": 17870 + }, + { + "ce_loss": 0.05180178955197334, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "distill_loss": 0.11064992845058441, + "epoch": 5.960640426951301, + "step": 17870 + }, + { + "epoch": 5.960640426951301, + "ref_ce_loss": 0.06782492250204086, + "step": 17870 + }, + { + "epoch": 5.963975983989326, + "loss": 0.3438, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "grad_norm": 2.341897964477539, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "learning_rate": 4.8138246733894924e-05, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "loss": 0.2747371792793274, + "step": 17880 + }, + { + "ce_loss": 0.05753449723124504, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "distill_loss": 0.0984141156077385, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "ref_ce_loss": 0.046176254749298096, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "loss": 0.2829621434211731, + "step": 17880 + }, + { + "ce_loss": 0.061033353209495544, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "distill_loss": 0.12907133996486664, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "ref_ce_loss": 0.061487022787332535, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "loss": 0.3641863465309143, + "step": 17880 + }, + { + "ce_loss": 0.01116390060633421, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "distill_loss": 0.11722405254840851, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "ref_ce_loss": 0.06099073961377144, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "loss": 0.5954475998878479, + "step": 17880 + }, + { + "ce_loss": 0.1188012957572937, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "distill_loss": 0.1340787261724472, + "epoch": 5.963975983989326, + "step": 17880 + }, + { + "epoch": 5.963975983989326, + "ref_ce_loss": 0.06728996336460114, + "step": 17880 + }, + { + "epoch": 5.967311541027351, + "loss": 0.3917, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "grad_norm": 3.270078659057617, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "learning_rate": 4.798964624429801e-05, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "loss": 0.3070351481437683, + "step": 17890 + }, + { + "ce_loss": 0.02258501574397087, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "distill_loss": 0.20282799005508423, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "ref_ce_loss": 0.05928156152367592, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "loss": 0.37176916003227234, + "step": 17890 + }, + { + "ce_loss": 0.07661992311477661, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "distill_loss": 0.17923808097839355, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "ref_ce_loss": 0.07780885696411133, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "loss": 0.26655101776123047, + "step": 17890 + }, + { + "ce_loss": 0.04947191849350929, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "distill_loss": 0.11034003645181656, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "ref_ce_loss": 0.0893992930650711, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "loss": 0.37074536085128784, + "step": 17890 + }, + { + "ce_loss": 0.08786562830209732, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "distill_loss": 0.1649412363767624, + "epoch": 5.967311541027351, + "step": 17890 + }, + { + "epoch": 5.967311541027351, + "ref_ce_loss": 0.07530762255191803, + "step": 17890 + }, + { + "epoch": 5.970647098065377, + "loss": 0.3688, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "grad_norm": 21.496910095214844, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "learning_rate": 4.784123178134653e-05, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "loss": 0.4974766969680786, + "step": 17900 + }, + { + "ce_loss": 0.022632336243987083, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "distill_loss": 0.1455262154340744, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "ref_ce_loss": 0.12247674912214279, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "loss": 1.078402042388916, + "step": 17900 + }, + { + "ce_loss": 0.1272539347410202, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "distill_loss": 0.21468423306941986, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "ref_ce_loss": 0.08153967559337616, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "loss": 0.2720794677734375, + "step": 17900 + }, + { + "ce_loss": 0.05145162716507912, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "distill_loss": 0.1308475136756897, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "ref_ce_loss": 0.06132218986749649, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "loss": 0.4849904179573059, + "step": 17900 + }, + { + "ce_loss": 0.03414897248148918, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "distill_loss": 0.18564105033874512, + "epoch": 5.970647098065377, + "step": 17900 + }, + { + "epoch": 5.970647098065377, + "ref_ce_loss": 0.07281206548213959, + "step": 17900 + }, + { + "epoch": 5.973982655103402, + "loss": 0.3864, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "grad_norm": 2.804548501968384, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "learning_rate": 4.769300361568994e-05, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "loss": 0.31776753067970276, + "step": 17910 + }, + { + "ce_loss": 0.0540342815220356, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "distill_loss": 0.11750628054141998, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "ref_ce_loss": 0.08454766869544983, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "loss": 0.4837797284126282, + "step": 17910 + }, + { + "ce_loss": 0.06660284101963043, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "distill_loss": 0.2543158233165741, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "ref_ce_loss": 0.06508289277553558, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "loss": 0.2823173999786377, + "step": 17910 + }, + { + "ce_loss": 0.0301041379570961, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "distill_loss": 0.14463257789611816, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "ref_ce_loss": 0.053473278880119324, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "loss": 0.2954986095428467, + "step": 17910 + }, + { + "ce_loss": 0.07780736684799194, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "distill_loss": 0.11172517389059067, + "epoch": 5.973982655103402, + "step": 17910 + }, + { + "epoch": 5.973982655103402, + "ref_ce_loss": 0.07242323458194733, + "step": 17910 + }, + { + "epoch": 5.9773182121414274, + "loss": 0.3964, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "grad_norm": 2.323547601699829, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "learning_rate": 4.7544962017638e-05, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "loss": 0.725130558013916, + "step": 17920 + }, + { + "ce_loss": 0.16385790705680847, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "distill_loss": 0.21144914627075195, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "ref_ce_loss": 0.13433006405830383, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "loss": 0.720917820930481, + "step": 17920 + }, + { + "ce_loss": 0.058357104659080505, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "distill_loss": 0.21775387227535248, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "ref_ce_loss": 0.13213202357292175, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "loss": 0.3602050542831421, + "step": 17920 + }, + { + "ce_loss": 0.019712118431925774, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "distill_loss": 0.08597089350223541, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "ref_ce_loss": 0.06011264771223068, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "loss": 0.24541306495666504, + "step": 17920 + }, + { + "ce_loss": 0.056333962827920914, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "distill_loss": 0.12353299558162689, + "epoch": 5.9773182121414274, + "step": 17920 + }, + { + "epoch": 5.9773182121414274, + "ref_ce_loss": 0.05609600991010666, + "step": 17920 + }, + { + "epoch": 5.980653769179453, + "loss": 0.3587, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "grad_norm": 2.24729323387146, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "learning_rate": 4.7397107257160056e-05, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "loss": 0.4122750163078308, + "step": 17930 + }, + { + "ce_loss": 0.07864921540021896, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "distill_loss": 0.20610211789608002, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "ref_ce_loss": 0.07581673562526703, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "loss": 0.2803608775138855, + "step": 17930 + }, + { + "ce_loss": 0.053923893719911575, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "distill_loss": 0.14501123130321503, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "ref_ce_loss": 0.08128301799297333, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "loss": 0.3348112106323242, + "step": 17930 + }, + { + "ce_loss": 0.05133431404829025, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "distill_loss": 0.1559540331363678, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "ref_ce_loss": 0.06095059961080551, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "loss": 0.6578578352928162, + "step": 17930 + }, + { + "ce_loss": 0.026385093107819557, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "distill_loss": 0.13736410439014435, + "epoch": 5.980653769179453, + "step": 17930 + }, + { + "epoch": 5.980653769179453, + "ref_ce_loss": 0.06968227028846741, + "step": 17930 + }, + { + "epoch": 5.983989326217478, + "loss": 0.3642, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "grad_norm": 2.0110793113708496, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "learning_rate": 4.724943960388499e-05, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "loss": 0.3668304681777954, + "step": 17940 + }, + { + "ce_loss": 0.08051525801420212, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "distill_loss": 0.17716766893863678, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "ref_ce_loss": 0.07459532469511032, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "loss": 0.3955934941768646, + "step": 17940 + }, + { + "ce_loss": 0.007330432068556547, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "distill_loss": 0.1856948286294937, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "ref_ce_loss": 0.08589158952236176, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "loss": 0.4008892774581909, + "step": 17940 + }, + { + "ce_loss": 0.12865528464317322, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "distill_loss": 0.17550432682037354, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "ref_ce_loss": 0.07676687091588974, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "loss": 0.5108517408370972, + "step": 17940 + }, + { + "ce_loss": 0.05577657371759415, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "distill_loss": 0.29523903131484985, + "epoch": 5.983989326217478, + "step": 17940 + }, + { + "epoch": 5.983989326217478, + "ref_ce_loss": 0.07486500591039658, + "step": 17940 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.411, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "grad_norm": 5.392819881439209, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "learning_rate": 4.7101959327100216e-05, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.29439449310302734, + "step": 17950 + }, + { + "ce_loss": 0.05010393634438515, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "distill_loss": 0.1031777560710907, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "ref_ce_loss": 0.054850682616233826, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.5090524554252625, + "step": 17950 + }, + { + "ce_loss": 0.11068634688854218, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "distill_loss": 0.11719664931297302, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "ref_ce_loss": 0.06820139288902283, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.6417162418365479, + "step": 17950 + }, + { + "ce_loss": 0.05354602634906769, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "distill_loss": 0.11590472608804703, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "ref_ce_loss": 0.08205898851156235, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "loss": 0.22045600414276123, + "step": 17950 + }, + { + "ce_loss": 0.03451256453990936, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "distill_loss": 0.1123829036951065, + "epoch": 5.9873248832555035, + "step": 17950 + }, + { + "epoch": 5.9873248832555035, + "ref_ce_loss": 0.04432832822203636, + "step": 17950 + }, + { + "epoch": 5.990660440293529, + "loss": 0.3739, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "grad_norm": 3.4776222705841064, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "learning_rate": 4.6954666695751704e-05, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "loss": 0.3324894607067108, + "step": 17960 + }, + { + "ce_loss": 0.04826672747731209, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "distill_loss": 0.11226597428321838, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "ref_ce_loss": 0.07892588526010513, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "loss": 0.36604809761047363, + "step": 17960 + }, + { + "ce_loss": 0.05619461089372635, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "distill_loss": 0.1991359293460846, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "ref_ce_loss": 0.0792030394077301, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "loss": 0.33557814359664917, + "step": 17960 + }, + { + "ce_loss": 0.05750252306461334, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "distill_loss": 0.1858070194721222, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "ref_ce_loss": 0.06949331611394882, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "loss": 0.27118319272994995, + "step": 17960 + }, + { + "ce_loss": 0.04408452659845352, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "distill_loss": 0.1736617088317871, + "epoch": 5.990660440293529, + "step": 17960 + }, + { + "epoch": 5.990660440293529, + "ref_ce_loss": 0.05310777202248573, + "step": 17960 + }, + { + "epoch": 5.993995997331554, + "loss": 0.3675, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "grad_norm": 2.418032169342041, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "learning_rate": 4.680756197844311e-05, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "loss": 0.2190331071615219, + "step": 17970 + }, + { + "ce_loss": 0.04840138927102089, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "distill_loss": 0.10126939415931702, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "ref_ce_loss": 0.046917498111724854, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "loss": 0.3270896077156067, + "step": 17970 + }, + { + "ce_loss": 0.04233918339014053, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "distill_loss": 0.18095675110816956, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "ref_ce_loss": 0.07469739764928818, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "loss": 0.30097100138664246, + "step": 17970 + }, + { + "ce_loss": 0.04831115901470184, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "distill_loss": 0.13550861179828644, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "ref_ce_loss": 0.08458152413368225, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "loss": 0.3853110671043396, + "step": 17970 + }, + { + "ce_loss": 0.10421506315469742, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "distill_loss": 0.21022917330265045, + "epoch": 5.993995997331554, + "step": 17970 + }, + { + "epoch": 5.993995997331554, + "ref_ce_loss": 0.053354986011981964, + "step": 17970 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.3617, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "grad_norm": 2.6923153400421143, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "learning_rate": 4.666064544343535e-05, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.5322884321212769, + "step": 17980 + }, + { + "ce_loss": 0.07125137001276016, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "distill_loss": 0.2436763346195221, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "ref_ce_loss": 0.10013176500797272, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.32280588150024414, + "step": 17980 + }, + { + "ce_loss": 0.08425823599100113, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "distill_loss": 0.12138201296329498, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "ref_ce_loss": 0.0700070708990097, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.15443341434001923, + "step": 17980 + }, + { + "ce_loss": 0.008114497177302837, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "distill_loss": 0.09282103925943375, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "ref_ce_loss": 0.053100306540727615, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "loss": 0.35013052821159363, + "step": 17980 + }, + { + "ce_loss": 0.09281091392040253, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "distill_loss": 0.13264884054660797, + "epoch": 5.9973315543695795, + "step": 17980 + }, + { + "epoch": 5.9973315543695795, + "ref_ce_loss": 0.08823098242282867, + "step": 17980 + }, + { + "epoch": 6.000667111407605, + "loss": 0.3923, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "grad_norm": 3.768995523452759, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "learning_rate": 4.651391735864629e-05, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "loss": 0.35800161957740784, + "step": 17990 + }, + { + "ce_loss": 0.08932358771562576, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "distill_loss": 0.13139638304710388, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "ref_ce_loss": 0.06785690784454346, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "loss": 0.2316976636648178, + "step": 17990 + }, + { + "ce_loss": 0.03572269156575203, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "distill_loss": 0.13418623805046082, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "ref_ce_loss": 0.061637792736291885, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "loss": 0.31546854972839355, + "step": 17990 + }, + { + "ce_loss": 0.026578150689601898, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "distill_loss": 0.112409807741642, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "ref_ce_loss": 0.07530684024095535, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "loss": 0.2855393886566162, + "step": 17990 + }, + { + "ce_loss": 0.017660409212112427, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "distill_loss": 0.09330619871616364, + "epoch": 6.000667111407605, + "step": 17990 + }, + { + "epoch": 6.000667111407605, + "ref_ce_loss": 0.044164739549160004, + "step": 17990 + }, + { + "epoch": 6.00400266844563, + "loss": 0.3248, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "grad_norm": 2.2335541248321533, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "learning_rate": 4.636737799164998e-05, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "loss": 0.19867165386676788, + "step": 18000 + }, + { + "ce_loss": 0.03791828081011772, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "distill_loss": 0.09197042137384415, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "ref_ce_loss": 0.03162221238017082, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "loss": 0.3159444332122803, + "step": 18000 + }, + { + "ce_loss": 0.04408169165253639, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "distill_loss": 0.13018789887428284, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "ref_ce_loss": 0.07600803673267365, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "loss": 0.24588634073734283, + "step": 18000 + }, + { + "ce_loss": 0.02073906734585762, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "distill_loss": 0.11770112812519073, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "ref_ce_loss": 0.028640158474445343, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "loss": 0.5873534679412842, + "step": 18000 + }, + { + "ce_loss": 0.1087309941649437, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "distill_loss": 0.14516711235046387, + "epoch": 6.00400266844563, + "step": 18000 + }, + { + "epoch": 6.00400266844563, + "ref_ce_loss": 0.09350698441267014, + "step": 18000 + }, + { + "epoch": 6.007338225483656, + "loss": 0.3314, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "grad_norm": 3.0365161895751953, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "learning_rate": 4.622102760967644e-05, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "loss": 0.24367427825927734, + "step": 18010 + }, + { + "ce_loss": 0.03099408745765686, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "distill_loss": 0.11708693206310272, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "ref_ce_loss": 0.058985915035009384, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "loss": 0.24253971874713898, + "step": 18010 + }, + { + "ce_loss": 0.027155818417668343, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "distill_loss": 0.14921477437019348, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "ref_ce_loss": 0.025843849405646324, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "loss": 0.2067640721797943, + "step": 18010 + }, + { + "ce_loss": 0.018789952620863914, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "distill_loss": 0.13064438104629517, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "ref_ce_loss": 0.03466375172138214, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "loss": 0.3233734369277954, + "step": 18010 + }, + { + "ce_loss": 0.11963633447885513, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "distill_loss": 0.16788633167743683, + "epoch": 6.007338225483656, + "step": 18010 + }, + { + "epoch": 6.007338225483656, + "ref_ce_loss": 0.035782862454652786, + "step": 18010 + }, + { + "epoch": 6.010673782521681, + "loss": 0.3234, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "grad_norm": 4.069089889526367, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "learning_rate": 4.607486647961117e-05, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "loss": 0.36928215622901917, + "step": 18020 + }, + { + "ce_loss": 0.02707325853407383, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "distill_loss": 0.1534077674150467, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "ref_ce_loss": 0.07559659332036972, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "loss": 0.2386367917060852, + "step": 18020 + }, + { + "ce_loss": 0.03612204268574715, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "distill_loss": 0.11690287292003632, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "ref_ce_loss": 0.060044534504413605, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "loss": 0.20590835809707642, + "step": 18020 + }, + { + "ce_loss": 0.05326259881258011, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "distill_loss": 0.1046588271856308, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "ref_ce_loss": 0.03534253314137459, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "loss": 0.2644188106060028, + "step": 18020 + }, + { + "ce_loss": 0.02832714095711708, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "distill_loss": 0.16729630529880524, + "epoch": 6.010673782521681, + "step": 18020 + }, + { + "epoch": 6.010673782521681, + "ref_ce_loss": 0.03283698111772537, + "step": 18020 + }, + { + "epoch": 6.014009339559706, + "loss": 0.3222, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "grad_norm": 2.2713472843170166, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "learning_rate": 4.592889486799428e-05, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "loss": 0.3372799754142761, + "step": 18030 + }, + { + "ce_loss": 0.0459248311817646, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "distill_loss": 0.20176801085472107, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "ref_ce_loss": 0.06285598129034042, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "loss": 0.28023040294647217, + "step": 18030 + }, + { + "ce_loss": 0.057125724852085114, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "distill_loss": 0.15166716277599335, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "ref_ce_loss": 0.07121897488832474, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "loss": 0.1802394837141037, + "step": 18030 + }, + { + "ce_loss": 0.03540526703000069, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "distill_loss": 0.09024612605571747, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "ref_ce_loss": 0.044617220759391785, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "loss": 0.2588875889778137, + "step": 18030 + }, + { + "ce_loss": 0.03209130838513374, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "distill_loss": 0.12625159323215485, + "epoch": 6.014009339559706, + "step": 18030 + }, + { + "epoch": 6.014009339559706, + "ref_ce_loss": 0.07114759832620621, + "step": 18030 + }, + { + "epoch": 6.017344896597732, + "loss": 0.3416, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "grad_norm": 2.186108350753784, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "learning_rate": 4.578311304102043e-05, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "loss": 0.17702947556972504, + "step": 18040 + }, + { + "ce_loss": 0.03151837736368179, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "distill_loss": 0.09358489513397217, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "ref_ce_loss": 0.051804766058921814, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "loss": 0.5507766604423523, + "step": 18040 + }, + { + "ce_loss": 0.07824090123176575, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "distill_loss": 0.12588661909103394, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "ref_ce_loss": 0.055321402847766876, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "loss": 0.38365840911865234, + "step": 18040 + }, + { + "ce_loss": 0.0746174082159996, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "distill_loss": 0.183217391371727, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "ref_ce_loss": 0.06258904933929443, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "loss": 0.23493067920207977, + "step": 18040 + }, + { + "ce_loss": 0.04061503708362579, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "distill_loss": 0.14113374054431915, + "epoch": 6.017344896597732, + "step": 18040 + }, + { + "epoch": 6.017344896597732, + "ref_ce_loss": 0.05288606137037277, + "step": 18040 + }, + { + "epoch": 6.020680453635757, + "loss": 0.3774, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "grad_norm": 2.509274482727051, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "learning_rate": 4.5637521264538244e-05, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "loss": 0.22896793484687805, + "step": 18050 + }, + { + "ce_loss": 0.020151875913143158, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "distill_loss": 0.11503048986196518, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "ref_ce_loss": 0.0390886589884758, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "loss": 0.30240610241889954, + "step": 18050 + }, + { + "ce_loss": 0.029857726767659187, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "distill_loss": 0.2106308937072754, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "ref_ce_loss": 0.042414914816617966, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "loss": 0.3427516222000122, + "step": 18050 + }, + { + "ce_loss": 0.14109061658382416, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "distill_loss": 0.13355794548988342, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "ref_ce_loss": 0.0491449311375618, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "loss": 0.4403334856033325, + "step": 18050 + }, + { + "ce_loss": 0.037846773862838745, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "distill_loss": 0.20196445286273956, + "epoch": 6.020680453635757, + "step": 18050 + }, + { + "epoch": 6.020680453635757, + "ref_ce_loss": 0.06973356008529663, + "step": 18050 + }, + { + "epoch": 6.024016010673782, + "loss": 0.311, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "grad_norm": 2.0426642894744873, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "learning_rate": 4.549211980404959e-05, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "loss": 0.17637385427951813, + "step": 18060 + }, + { + "ce_loss": 0.021697957068681717, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "distill_loss": 0.11747463792562485, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "ref_ce_loss": 0.03694509342312813, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "loss": 0.273229718208313, + "step": 18060 + }, + { + "ce_loss": 0.014125284738838673, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "distill_loss": 0.08990029990673065, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "ref_ce_loss": 0.03305063769221306, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "loss": 0.4129598140716553, + "step": 18060 + }, + { + "ce_loss": 0.03795570880174637, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "distill_loss": 0.14061012864112854, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "ref_ce_loss": 0.06605387479066849, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "loss": 0.5871542692184448, + "step": 18060 + }, + { + "ce_loss": 0.06563892215490341, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "distill_loss": 0.146104097366333, + "epoch": 6.024016010673782, + "step": 18060 + }, + { + "epoch": 6.024016010673782, + "ref_ce_loss": 0.08084701746702194, + "step": 18060 + }, + { + "epoch": 6.027351567711808, + "loss": 0.3051, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "grad_norm": 1.981549620628357, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "learning_rate": 4.534690892470942e-05, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "loss": 0.45083296298980713, + "step": 18070 + }, + { + "ce_loss": 0.02970081754028797, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "distill_loss": 0.10548330098390579, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "ref_ce_loss": 0.06935655325651169, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "loss": 0.3245180547237396, + "step": 18070 + }, + { + "ce_loss": 0.05906673148274422, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "distill_loss": 0.156429722905159, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "ref_ce_loss": 0.08586925268173218, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "loss": 0.26633402705192566, + "step": 18070 + }, + { + "ce_loss": 0.05068255960941315, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "distill_loss": 0.13847847282886505, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "ref_ce_loss": 0.05850810185074806, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "loss": 0.38250064849853516, + "step": 18070 + }, + { + "ce_loss": 0.036539558321237564, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "distill_loss": 0.2482924610376358, + "epoch": 6.027351567711808, + "step": 18070 + }, + { + "epoch": 6.027351567711808, + "ref_ce_loss": 0.06506029516458511, + "step": 18070 + }, + { + "epoch": 6.030687124749833, + "loss": 0.2887, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "grad_norm": 2.2178709506988525, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "learning_rate": 4.52018888913251e-05, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "loss": 0.41285941004753113, + "step": 18080 + }, + { + "ce_loss": 0.10323239117860794, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "distill_loss": 0.14549994468688965, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "ref_ce_loss": 0.051457539200782776, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "loss": 0.28614822030067444, + "step": 18080 + }, + { + "ce_loss": 0.06471443176269531, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "distill_loss": 0.14293956756591797, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "ref_ce_loss": 0.06065152958035469, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "loss": 0.3671509027481079, + "step": 18080 + }, + { + "ce_loss": 0.05830796808004379, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "distill_loss": 0.13328364491462708, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "ref_ce_loss": 0.040838442742824554, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "loss": 0.18215957283973694, + "step": 18080 + }, + { + "ce_loss": 0.030505971983075142, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "distill_loss": 0.10520057380199432, + "epoch": 6.030687124749833, + "step": 18080 + }, + { + "epoch": 6.030687124749833, + "ref_ce_loss": 0.03815292939543724, + "step": 18080 + }, + { + "epoch": 6.034022681787858, + "loss": 0.3066, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "grad_norm": 2.430351734161377, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "learning_rate": 4.505705996835596e-05, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "loss": 0.28383922576904297, + "step": 18090 + }, + { + "ce_loss": 0.05320765823125839, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "distill_loss": 0.13925553858280182, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "ref_ce_loss": 0.032751839607954025, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "loss": 0.2511315941810608, + "step": 18090 + }, + { + "ce_loss": 0.0436200387775898, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "distill_loss": 0.11578132212162018, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "ref_ce_loss": 0.073519267141819, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "loss": 0.26703473925590515, + "step": 18090 + }, + { + "ce_loss": 0.05209790915250778, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "distill_loss": 0.09003359824419022, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "ref_ce_loss": 0.08476582914590836, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "loss": 0.24599985778331757, + "step": 18090 + }, + { + "ce_loss": 0.07031101733446121, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "distill_loss": 0.10161477327346802, + "epoch": 6.034022681787858, + "step": 18090 + }, + { + "epoch": 6.034022681787858, + "ref_ce_loss": 0.029595762491226196, + "step": 18090 + }, + { + "epoch": 6.037358238825884, + "loss": 0.3341, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "grad_norm": 2.7874391078948975, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "learning_rate": 4.491242241991286e-05, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "loss": 0.4746611416339874, + "step": 18100 + }, + { + "ce_loss": 0.04059432074427605, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "distill_loss": 0.12545041739940643, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "ref_ce_loss": 0.07350048422813416, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "loss": 0.29115569591522217, + "step": 18100 + }, + { + "ce_loss": 0.043459825217723846, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "distill_loss": 0.17026124894618988, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "ref_ce_loss": 0.05982349440455437, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "loss": 0.32593053579330444, + "step": 18100 + }, + { + "ce_loss": 0.05972723290324211, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "distill_loss": 0.14202600717544556, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "ref_ce_loss": 0.08605913072824478, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "loss": 0.1863568127155304, + "step": 18100 + }, + { + "ce_loss": 0.027750154957175255, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "distill_loss": 0.08981994539499283, + "epoch": 6.037358238825884, + "step": 18100 + }, + { + "epoch": 6.037358238825884, + "ref_ce_loss": 0.03250458091497421, + "step": 18100 + }, + { + "epoch": 6.040693795863909, + "loss": 0.3122, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "grad_norm": 2.810276746749878, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "learning_rate": 4.4767976509757563e-05, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "loss": 0.20042987167835236, + "step": 18110 + }, + { + "ce_loss": 0.014375542290508747, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "distill_loss": 0.11172399669885635, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "ref_ce_loss": 0.05216660350561142, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "loss": 0.23054620623588562, + "step": 18110 + }, + { + "ce_loss": 0.020065613090991974, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "distill_loss": 0.11909779906272888, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "ref_ce_loss": 0.05230550840497017, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "loss": 0.27268943190574646, + "step": 18110 + }, + { + "ce_loss": 0.06317111849784851, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "distill_loss": 0.12656812369823456, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "ref_ce_loss": 0.08271820843219757, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "loss": 0.28299564123153687, + "step": 18110 + }, + { + "ce_loss": 0.07387886941432953, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "distill_loss": 0.1497463583946228, + "epoch": 6.040693795863909, + "step": 18110 + }, + { + "epoch": 6.040693795863909, + "ref_ce_loss": 0.0429847314953804, + "step": 18110 + }, + { + "epoch": 6.044029352901934, + "loss": 0.3004, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "grad_norm": 2.7142744064331055, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "learning_rate": 4.462372250130247e-05, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "loss": 0.3670087456703186, + "step": 18120 + }, + { + "ce_loss": 0.06371357291936874, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "distill_loss": 0.1704612523317337, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "ref_ce_loss": 0.1321641057729721, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "loss": 0.2539289891719818, + "step": 18120 + }, + { + "ce_loss": 0.047485459595918655, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "distill_loss": 0.11303704977035522, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "ref_ce_loss": 0.06555590778589249, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "loss": 0.22901363670825958, + "step": 18120 + }, + { + "ce_loss": 0.052790023386478424, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "distill_loss": 0.10688269138336182, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "ref_ce_loss": 0.0471915639936924, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "loss": 0.5411761999130249, + "step": 18120 + }, + { + "ce_loss": 0.038415033370256424, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "distill_loss": 0.08298727869987488, + "epoch": 6.044029352901934, + "step": 18120 + }, + { + "epoch": 6.044029352901934, + "ref_ce_loss": 0.036232996731996536, + "step": 18120 + }, + { + "epoch": 6.04736490993996, + "loss": 0.3601, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "grad_norm": 4.615497589111328, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "learning_rate": 4.447966065760997e-05, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "loss": 0.33229541778564453, + "step": 18130 + }, + { + "ce_loss": 0.046142105013132095, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "distill_loss": 0.16430698335170746, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "ref_ce_loss": 0.0699513629078865, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "loss": 0.44384586811065674, + "step": 18130 + }, + { + "ce_loss": 0.028619125485420227, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "distill_loss": 0.14834138751029968, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "ref_ce_loss": 0.042568981647491455, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "loss": 0.5875935554504395, + "step": 18130 + }, + { + "ce_loss": 0.014686590991914272, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "distill_loss": 0.12745922803878784, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "ref_ce_loss": 0.10846404731273651, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "loss": 0.2178553193807602, + "step": 18130 + }, + { + "ce_loss": 0.05181694030761719, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "distill_loss": 0.1011785939335823, + "epoch": 6.04736490993996, + "step": 18130 + }, + { + "epoch": 6.04736490993996, + "ref_ce_loss": 0.04709342122077942, + "step": 18130 + }, + { + "epoch": 6.050700466977985, + "loss": 0.3242, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "grad_norm": 1.946622371673584, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "learning_rate": 4.433579124139206e-05, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "loss": 0.25335797667503357, + "step": 18140 + }, + { + "ce_loss": 0.0676303505897522, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "distill_loss": 0.15347588062286377, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "ref_ce_loss": 0.03215770795941353, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "loss": 0.40435728430747986, + "step": 18140 + }, + { + "ce_loss": 0.0626855120062828, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "distill_loss": 0.1381070762872696, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "ref_ce_loss": 0.08322049677371979, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "loss": 0.3568632900714874, + "step": 18140 + }, + { + "ce_loss": 0.11419306695461273, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "distill_loss": 0.13917380571365356, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "ref_ce_loss": 0.07467635720968246, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "loss": 0.27939581871032715, + "step": 18140 + }, + { + "ce_loss": 0.012311798520386219, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "distill_loss": 0.16337209939956665, + "epoch": 6.050700466977985, + "step": 18140 + }, + { + "epoch": 6.050700466977985, + "ref_ce_loss": 0.06114046275615692, + "step": 18140 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.3313, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "grad_norm": 2.934711217880249, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "learning_rate": 4.419211451500986e-05, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.3649110198020935, + "step": 18150 + }, + { + "ce_loss": 0.08198490738868713, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "distill_loss": 0.13583259284496307, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "ref_ce_loss": 0.06769489496946335, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.20053881406784058, + "step": 18150 + }, + { + "ce_loss": 0.031547416001558304, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "distill_loss": 0.12106670439243317, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "ref_ce_loss": 0.03251959756016731, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.19803735613822937, + "step": 18150 + }, + { + "ce_loss": 0.013310973532497883, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "distill_loss": 0.09719417989253998, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "ref_ce_loss": 0.05349541828036308, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "loss": 0.22628724575042725, + "step": 18150 + }, + { + "ce_loss": 0.02451532892882824, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "distill_loss": 0.11430226266384125, + "epoch": 6.0540360240160105, + "step": 18150 + }, + { + "epoch": 6.0540360240160105, + "ref_ce_loss": 0.04020338132977486, + "step": 18150 + }, + { + "epoch": 6.057371581054036, + "loss": 0.2857, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "grad_norm": 2.198643445968628, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "learning_rate": 4.4048630740472915e-05, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "loss": 0.3034615218639374, + "step": 18160 + }, + { + "ce_loss": 0.03148229420185089, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "distill_loss": 0.10916534811258316, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "ref_ce_loss": 0.055936042219400406, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "loss": 0.1808771938085556, + "step": 18160 + }, + { + "ce_loss": 0.01604457013309002, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "distill_loss": 0.09643196314573288, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "ref_ce_loss": 0.04746682941913605, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "loss": 0.3949342370033264, + "step": 18160 + }, + { + "ce_loss": 0.07888796180486679, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "distill_loss": 0.11194111406803131, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "ref_ce_loss": 0.06055905297398567, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "loss": 0.3000640571117401, + "step": 18160 + }, + { + "ce_loss": 0.10470689833164215, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "distill_loss": 0.13388051092624664, + "epoch": 6.057371581054036, + "step": 18160 + }, + { + "epoch": 6.057371581054036, + "ref_ce_loss": 0.06086868792772293, + "step": 18160 + }, + { + "epoch": 6.060707138092061, + "loss": 0.3392, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "grad_norm": 2.10493803024292, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "learning_rate": 4.390534017943911e-05, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "loss": 0.5440735816955566, + "step": 18170 + }, + { + "ce_loss": 0.07138556241989136, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "distill_loss": 0.15884366631507874, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "ref_ce_loss": 0.05661080777645111, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "loss": 0.46446603536605835, + "step": 18170 + }, + { + "ce_loss": 0.05201159417629242, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "distill_loss": 0.1514742076396942, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "ref_ce_loss": 0.042757753282785416, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "loss": 0.1995864063501358, + "step": 18170 + }, + { + "ce_loss": 0.02640986256301403, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "distill_loss": 0.10817944258451462, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "ref_ce_loss": 0.06455156952142715, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "loss": 0.26589059829711914, + "step": 18170 + }, + { + "ce_loss": 0.035571422427892685, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "distill_loss": 0.1171257495880127, + "epoch": 6.060707138092061, + "step": 18170 + }, + { + "epoch": 6.060707138092061, + "ref_ce_loss": 0.07539810985326767, + "step": 18170 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.311, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "grad_norm": 2.6534509658813477, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "learning_rate": 4.376224309321388e-05, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.18756568431854248, + "step": 18180 + }, + { + "ce_loss": 0.02319115772843361, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "distill_loss": 0.09259458631277084, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "ref_ce_loss": 0.071564681828022, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.36065298318862915, + "step": 18180 + }, + { + "ce_loss": 0.10762543231248856, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "distill_loss": 0.13027355074882507, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "ref_ce_loss": 0.05452357977628708, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.26103997230529785, + "step": 18180 + }, + { + "ce_loss": 0.08527110517024994, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "distill_loss": 0.09655442833900452, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "ref_ce_loss": 0.05029728263616562, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "loss": 0.4683290123939514, + "step": 18180 + }, + { + "ce_loss": 0.03932412713766098, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "distill_loss": 0.09566164761781693, + "epoch": 6.0640426951300865, + "step": 18180 + }, + { + "epoch": 6.0640426951300865, + "ref_ce_loss": 0.036095380783081055, + "step": 18180 + }, + { + "epoch": 6.067378252168112, + "loss": 0.3035, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "grad_norm": 2.610994338989258, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "learning_rate": 4.361933974274987e-05, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "loss": 0.10692508518695831, + "step": 18190 + }, + { + "ce_loss": 0.0038241662550717592, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "distill_loss": 0.07537472993135452, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "ref_ce_loss": 0.02766880951821804, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "loss": 0.3644680976867676, + "step": 18190 + }, + { + "ce_loss": 0.08397339284420013, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "distill_loss": 0.14016035199165344, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "ref_ce_loss": 0.06778319925069809, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "loss": 0.22137466073036194, + "step": 18190 + }, + { + "ce_loss": 0.07818184047937393, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "distill_loss": 0.0896679237484932, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "ref_ce_loss": 0.05327446386218071, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "loss": 0.3335571885108948, + "step": 18190 + }, + { + "ce_loss": 0.04578216373920441, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "distill_loss": 0.13960577547550201, + "epoch": 6.067378252168112, + "step": 18190 + }, + { + "epoch": 6.067378252168112, + "ref_ce_loss": 0.06935045123100281, + "step": 18190 + }, + { + "epoch": 6.070713809206137, + "loss": 0.2786, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "grad_norm": 2.146139144897461, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "learning_rate": 4.347663038864648e-05, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "loss": 0.3073939085006714, + "step": 18200 + }, + { + "ce_loss": 0.07068096101284027, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "distill_loss": 0.15674972534179688, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "ref_ce_loss": 0.06139358878135681, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "loss": 0.7768551111221313, + "step": 18200 + }, + { + "ce_loss": 0.07332314550876617, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "distill_loss": 0.17987912893295288, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "ref_ce_loss": 0.08335179090499878, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "loss": 0.24506054818630219, + "step": 18200 + }, + { + "ce_loss": 0.01479860208928585, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "distill_loss": 0.1127048134803772, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "ref_ce_loss": 0.06586452573537827, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "loss": 0.31514981389045715, + "step": 18200 + }, + { + "ce_loss": 0.058193325996398926, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "distill_loss": 0.14873111248016357, + "epoch": 6.070713809206137, + "step": 18200 + }, + { + "epoch": 6.070713809206137, + "ref_ce_loss": 0.05019865557551384, + "step": 18200 + }, + { + "epoch": 6.074049366244163, + "loss": 0.3327, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "grad_norm": 3.6657485961914062, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "learning_rate": 4.3334115291149154e-05, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "loss": 0.21963410079479218, + "step": 18210 + }, + { + "ce_loss": 0.029777036979794502, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "distill_loss": 0.09637679904699326, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "ref_ce_loss": 0.042842797935009, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "loss": 0.35416334867477417, + "step": 18210 + }, + { + "ce_loss": 0.07847097516059875, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "distill_loss": 0.22109586000442505, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "ref_ce_loss": 0.05443606525659561, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "loss": 0.23724272847175598, + "step": 18210 + }, + { + "ce_loss": 0.02057749032974243, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "distill_loss": 0.12848436832427979, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "ref_ce_loss": 0.060044120997190475, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "loss": 0.5802603363990784, + "step": 18210 + }, + { + "ce_loss": 0.036357879638671875, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "distill_loss": 0.10860496759414673, + "epoch": 6.074049366244163, + "step": 18210 + }, + { + "epoch": 6.074049366244163, + "ref_ce_loss": 0.05457604303956032, + "step": 18210 + }, + { + "epoch": 6.077384923282188, + "loss": 0.3228, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "grad_norm": 2.0046796798706055, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "learning_rate": 4.31917947101493e-05, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "loss": 0.5089726448059082, + "step": 18220 + }, + { + "ce_loss": 0.06141791120171547, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "distill_loss": 0.14615316689014435, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "ref_ce_loss": 0.060697052627801895, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "loss": 0.29290029406547546, + "step": 18220 + }, + { + "ce_loss": 0.05762191489338875, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "distill_loss": 0.13641268014907837, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "ref_ce_loss": 0.05285864323377609, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "loss": 0.16176757216453552, + "step": 18220 + }, + { + "ce_loss": 0.011141808703541756, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "distill_loss": 0.0926661565899849, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "ref_ce_loss": 0.03927552327513695, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "loss": 0.24812178313732147, + "step": 18220 + }, + { + "ce_loss": 0.027461422607302666, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "distill_loss": 0.1357075273990631, + "epoch": 6.077384923282188, + "step": 18220 + }, + { + "epoch": 6.077384923282188, + "ref_ce_loss": 0.048979636281728745, + "step": 18220 + }, + { + "epoch": 6.080720480320213, + "loss": 0.3197, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "grad_norm": 2.6882028579711914, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "learning_rate": 4.304966890518349e-05, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "loss": 0.2826864421367645, + "step": 18230 + }, + { + "ce_loss": 0.06585119664669037, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "distill_loss": 0.1340772956609726, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "ref_ce_loss": 0.04505593702197075, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "loss": 0.29510295391082764, + "step": 18230 + }, + { + "ce_loss": 0.052019789814949036, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "distill_loss": 0.15474607050418854, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "ref_ce_loss": 0.05676916614174843, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "loss": 0.24109765887260437, + "step": 18230 + }, + { + "ce_loss": 0.007869729772210121, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "distill_loss": 0.087680384516716, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "ref_ce_loss": 0.042009759694337845, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "loss": 0.1544315367937088, + "step": 18230 + }, + { + "ce_loss": 0.02946949191391468, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "distill_loss": 0.09035389125347137, + "epoch": 6.080720480320213, + "step": 18230 + }, + { + "epoch": 6.080720480320213, + "ref_ce_loss": 0.03453512117266655, + "step": 18230 + }, + { + "epoch": 6.084056037358239, + "loss": 0.3303, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "grad_norm": 2.3786280155181885, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "learning_rate": 4.290773813543312e-05, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "loss": 0.5657137632369995, + "step": 18240 + }, + { + "ce_loss": 0.04016527906060219, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "distill_loss": 0.09664686024188995, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "ref_ce_loss": 0.06613688915967941, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "loss": 0.4780872166156769, + "step": 18240 + }, + { + "ce_loss": 0.08768974244594574, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "distill_loss": 0.23951031267642975, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "ref_ce_loss": 0.0796649381518364, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "loss": 0.21369272470474243, + "step": 18240 + }, + { + "ce_loss": 0.03242059797048569, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "distill_loss": 0.12135210633277893, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "ref_ce_loss": 0.05987042933702469, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "loss": 0.2175336629152298, + "step": 18240 + }, + { + "ce_loss": 0.03970927745103836, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "distill_loss": 0.1291152983903885, + "epoch": 6.084056037358239, + "step": 18240 + }, + { + "epoch": 6.084056037358239, + "ref_ce_loss": 0.04774701967835426, + "step": 18240 + }, + { + "epoch": 6.087391594396264, + "loss": 0.3106, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "grad_norm": 1.8230851888656616, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "learning_rate": 4.2766002659724014e-05, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "loss": 0.20345722138881683, + "step": 18250 + }, + { + "ce_loss": 0.055368226021528244, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "distill_loss": 0.11434651166200638, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "ref_ce_loss": 0.033585384488105774, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "loss": 0.3076462745666504, + "step": 18250 + }, + { + "ce_loss": 0.04119785875082016, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "distill_loss": 0.1454070806503296, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "ref_ce_loss": 0.09140428155660629, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "loss": 0.2772146463394165, + "step": 18250 + }, + { + "ce_loss": 0.04244215041399002, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "distill_loss": 0.11346837878227234, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "ref_ce_loss": 0.08478512614965439, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "loss": 0.437461256980896, + "step": 18250 + }, + { + "ce_loss": 0.05325673148036003, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "distill_loss": 0.12055876851081848, + "epoch": 6.087391594396264, + "step": 18250 + }, + { + "epoch": 6.087391594396264, + "ref_ce_loss": 0.044090624898672104, + "step": 18250 + }, + { + "epoch": 6.090727151434289, + "loss": 0.2982, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "grad_norm": 1.8949027061462402, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "learning_rate": 4.262446273652562e-05, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "loss": 0.2269206941127777, + "step": 18260 + }, + { + "ce_loss": 0.062054380774497986, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "distill_loss": 0.10284656286239624, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "ref_ce_loss": 0.04559963196516037, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "loss": 0.30982521176338196, + "step": 18260 + }, + { + "ce_loss": 0.054068103432655334, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "distill_loss": 0.15855297446250916, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "ref_ce_loss": 0.07700146734714508, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "loss": 0.23585665225982666, + "step": 18260 + }, + { + "ce_loss": 0.015442995354533195, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "distill_loss": 0.10868162661790848, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "ref_ce_loss": 0.041581302881240845, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "loss": 0.17389631271362305, + "step": 18260 + }, + { + "ce_loss": 0.015349296852946281, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "distill_loss": 0.11651057004928589, + "epoch": 6.090727151434289, + "step": 18260 + }, + { + "epoch": 6.090727151434289, + "ref_ce_loss": 0.041636236011981964, + "step": 18260 + }, + { + "epoch": 6.094062708472315, + "loss": 0.2793, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "grad_norm": 1.77875816822052, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "learning_rate": 4.248311862395103e-05, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "loss": 0.3750142753124237, + "step": 18270 + }, + { + "ce_loss": 0.09963914006948471, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "distill_loss": 0.21521353721618652, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "ref_ce_loss": 0.059947673231363297, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "loss": 0.3384113311767578, + "step": 18270 + }, + { + "ce_loss": 0.049503691494464874, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "distill_loss": 0.13446229696273804, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "ref_ce_loss": 0.08077162504196167, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "loss": 0.3235764503479004, + "step": 18270 + }, + { + "ce_loss": 0.07446154206991196, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "distill_loss": 0.1346776783466339, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "ref_ce_loss": 0.08494427055120468, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "loss": 0.8587777614593506, + "step": 18270 + }, + { + "ce_loss": 0.08527383208274841, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "distill_loss": 0.18738584220409393, + "epoch": 6.094062708472315, + "step": 18270 + }, + { + "epoch": 6.094062708472315, + "ref_ce_loss": 0.10342002660036087, + "step": 18270 + }, + { + "epoch": 6.09739826551034, + "loss": 0.306, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "grad_norm": 4.400006294250488, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "learning_rate": 4.234197057975615e-05, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "loss": 0.44186297059059143, + "step": 18280 + }, + { + "ce_loss": 0.08321335166692734, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "distill_loss": 0.27311789989471436, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "ref_ce_loss": 0.05534227564930916, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "loss": 0.3389902710914612, + "step": 18280 + }, + { + "ce_loss": 0.07443110644817352, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "distill_loss": 0.12935440242290497, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "ref_ce_loss": 0.08985836803913116, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "loss": 0.19584959745407104, + "step": 18280 + }, + { + "ce_loss": 0.014480430632829666, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "distill_loss": 0.08713266998529434, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "ref_ce_loss": 0.06017642840743065, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "loss": 0.22717109322547913, + "step": 18280 + }, + { + "ce_loss": 0.03948648273944855, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "distill_loss": 0.1253734976053238, + "epoch": 6.09739826551034, + "step": 18280 + }, + { + "epoch": 6.09739826551034, + "ref_ce_loss": 0.04686824604868889, + "step": 18280 + }, + { + "epoch": 6.100733822548365, + "loss": 0.3242, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "grad_norm": 1.7938190698623657, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "learning_rate": 4.2201018861339226e-05, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "loss": 0.26108479499816895, + "step": 18290 + }, + { + "ce_loss": 0.05302269011735916, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "distill_loss": 0.11189974844455719, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "ref_ce_loss": 0.05024619773030281, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "loss": 0.37917935848236084, + "step": 18290 + }, + { + "ce_loss": 0.04029357060790062, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "distill_loss": 0.12598711252212524, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "ref_ce_loss": 0.07380714267492294, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "loss": 0.32163190841674805, + "step": 18290 + }, + { + "ce_loss": 0.06296807527542114, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "distill_loss": 0.16332390904426575, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "ref_ce_loss": 0.09513794630765915, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "loss": 0.2830953598022461, + "step": 18290 + }, + { + "ce_loss": 0.0462760366499424, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "distill_loss": 0.15794722735881805, + "epoch": 6.100733822548365, + "step": 18290 + }, + { + "epoch": 6.100733822548365, + "ref_ce_loss": 0.05615334212779999, + "step": 18290 + }, + { + "epoch": 6.104069379586391, + "loss": 0.3294, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "grad_norm": 2.763949394226074, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "learning_rate": 4.2060263725740756e-05, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "loss": 0.392032265663147, + "step": 18300 + }, + { + "ce_loss": 0.09934784471988678, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "distill_loss": 0.20907564461231232, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "ref_ce_loss": 0.060490336269140244, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "loss": 0.18854185938835144, + "step": 18300 + }, + { + "ce_loss": 0.019186168909072876, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "distill_loss": 0.12396436184644699, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "ref_ce_loss": 0.04528871178627014, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "loss": 0.22943469882011414, + "step": 18300 + }, + { + "ce_loss": 0.04969404265284538, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "distill_loss": 0.10475605726242065, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "ref_ce_loss": 0.056972041726112366, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "loss": 0.24096515774726868, + "step": 18300 + }, + { + "ce_loss": 0.00777512276545167, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "distill_loss": 0.10689795762300491, + "epoch": 6.104069379586391, + "step": 18300 + }, + { + "epoch": 6.104069379586391, + "ref_ce_loss": 0.06155410408973694, + "step": 18300 + }, + { + "epoch": 6.107404936624416, + "loss": 0.3067, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "grad_norm": 2.4878458976745605, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "learning_rate": 4.191970542964245e-05, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "loss": 0.27044180035591125, + "step": 18310 + }, + { + "ce_loss": 0.022618383169174194, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "distill_loss": 0.11641081422567368, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "ref_ce_loss": 0.05938781052827835, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "loss": 0.2108479142189026, + "step": 18310 + }, + { + "ce_loss": 0.019511261954903603, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "distill_loss": 0.109732985496521, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "ref_ce_loss": 0.027610134333372116, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "loss": 0.6706960201263428, + "step": 18310 + }, + { + "ce_loss": 0.09902139008045197, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "distill_loss": 0.16243474185466766, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "ref_ce_loss": 0.06722650676965714, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "loss": 0.3304760754108429, + "step": 18310 + }, + { + "ce_loss": 0.0915578156709671, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "distill_loss": 0.1474878489971161, + "epoch": 6.107404936624416, + "step": 18310 + }, + { + "epoch": 6.107404936624416, + "ref_ce_loss": 0.04289526119828224, + "step": 18310 + }, + { + "epoch": 6.110740493662441, + "loss": 0.3309, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "grad_norm": 2.8061115741729736, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "learning_rate": 4.177934422936725e-05, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "loss": 0.21884343028068542, + "step": 18320 + }, + { + "ce_loss": 0.020756900310516357, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "distill_loss": 0.0981239303946495, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "ref_ce_loss": 0.05400104448199272, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "loss": 0.6406104564666748, + "step": 18320 + }, + { + "ce_loss": 0.060525428503751755, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "distill_loss": 0.15946635603904724, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "ref_ce_loss": 0.09081117808818817, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "loss": 0.19435235857963562, + "step": 18320 + }, + { + "ce_loss": 0.016969850286841393, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "distill_loss": 0.11550012975931168, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "ref_ce_loss": 0.04261741042137146, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "loss": 0.3633389472961426, + "step": 18320 + }, + { + "ce_loss": 0.028799138963222504, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "distill_loss": 0.2342735230922699, + "epoch": 6.110740493662441, + "step": 18320 + }, + { + "epoch": 6.110740493662441, + "ref_ce_loss": 0.07534074783325195, + "step": 18320 + }, + { + "epoch": 6.114076050700467, + "loss": 0.3231, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "grad_norm": 2.804523468017578, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "learning_rate": 4.163918038087865e-05, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "loss": 0.22182205319404602, + "step": 18330 + }, + { + "ce_loss": 0.04863829165697098, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "distill_loss": 0.10979770869016647, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "ref_ce_loss": 0.06329606473445892, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "loss": 0.19124485552310944, + "step": 18330 + }, + { + "ce_loss": 0.019544605165719986, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "distill_loss": 0.10157377272844315, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "ref_ce_loss": 0.045619696378707886, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "loss": 0.30762940645217896, + "step": 18330 + }, + { + "ce_loss": 0.023676371201872826, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "distill_loss": 0.09587828814983368, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "ref_ce_loss": 0.04057086259126663, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "loss": 0.26268070936203003, + "step": 18330 + }, + { + "ce_loss": 0.03551900386810303, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "distill_loss": 0.11628009378910065, + "epoch": 6.114076050700467, + "step": 18330 + }, + { + "epoch": 6.114076050700467, + "ref_ce_loss": 0.04114997386932373, + "step": 18330 + }, + { + "epoch": 6.117411607738492, + "loss": 0.3617, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "grad_norm": 3.2843000888824463, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "learning_rate": 4.149921413978014e-05, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "loss": 0.26372018456459045, + "step": 18340 + }, + { + "ce_loss": 0.022633861750364304, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "distill_loss": 0.1458633840084076, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "ref_ce_loss": 0.05152313411235809, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "loss": 0.29105961322784424, + "step": 18340 + }, + { + "ce_loss": 0.09380751103162766, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "distill_loss": 0.1378300040960312, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "ref_ce_loss": 0.059045396745204926, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "loss": 0.4499114751815796, + "step": 18340 + }, + { + "ce_loss": 0.045432500541210175, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "distill_loss": 0.14073380827903748, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "ref_ce_loss": 0.056476663798093796, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "loss": 0.19718751311302185, + "step": 18340 + }, + { + "ce_loss": 0.031024429947137833, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "distill_loss": 0.0994877964258194, + "epoch": 6.117411607738492, + "step": 18340 + }, + { + "epoch": 6.117411607738492, + "ref_ce_loss": 0.056632742285728455, + "step": 18340 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.3108, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "grad_norm": 2.1846914291381836, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "learning_rate": 4.1359445761314926e-05, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.1581880897283554, + "step": 18350 + }, + { + "ce_loss": 0.01265860628336668, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "distill_loss": 0.09621373564004898, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "ref_ce_loss": 0.0492212139070034, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.3098943829536438, + "step": 18350 + }, + { + "ce_loss": 0.0733032152056694, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "distill_loss": 0.16024382412433624, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "ref_ce_loss": 0.05418863520026207, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.24440094828605652, + "step": 18350 + }, + { + "ce_loss": 0.04196551814675331, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "distill_loss": 0.1325785517692566, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "ref_ce_loss": 0.06961900740861893, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "loss": 0.22027157247066498, + "step": 18350 + }, + { + "ce_loss": 0.02423410303890705, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "distill_loss": 0.11101806163787842, + "epoch": 6.1207471647765175, + "step": 18350 + }, + { + "epoch": 6.1207471647765175, + "ref_ce_loss": 0.029094528406858444, + "step": 18350 + }, + { + "epoch": 6.124082721814543, + "loss": 0.3069, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "grad_norm": 3.5036215782165527, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "learning_rate": 4.1219875500365516e-05, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "loss": 0.17867614328861237, + "step": 18360 + }, + { + "ce_loss": 0.02497381530702114, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "distill_loss": 0.07437723875045776, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "ref_ce_loss": 0.04872718080878258, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "loss": 0.258394718170166, + "step": 18360 + }, + { + "ce_loss": 0.0596681647002697, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "distill_loss": 0.13986316323280334, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "ref_ce_loss": 0.0443265363574028, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "loss": 0.15350134670734406, + "step": 18360 + }, + { + "ce_loss": 0.015066183172166348, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "distill_loss": 0.10006022453308105, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "ref_ce_loss": 0.038168370723724365, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "loss": 0.2913787066936493, + "step": 18360 + }, + { + "ce_loss": 0.03667327016592026, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "distill_loss": 0.1875118911266327, + "epoch": 6.124082721814543, + "step": 18360 + }, + { + "epoch": 6.124082721814543, + "ref_ce_loss": 0.0670209750533104, + "step": 18360 + }, + { + "epoch": 6.127418278852568, + "loss": 0.2767, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "grad_norm": 2.7282140254974365, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "learning_rate": 4.108050361145291e-05, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "loss": 0.27476048469543457, + "step": 18370 + }, + { + "ce_loss": 0.036131396889686584, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "distill_loss": 0.1398783028125763, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "ref_ce_loss": 0.059157684445381165, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "loss": 0.2925405502319336, + "step": 18370 + }, + { + "ce_loss": 0.024877002462744713, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "distill_loss": 0.16725745797157288, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "ref_ce_loss": 0.06880000978708267, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "loss": 0.7054593563079834, + "step": 18370 + }, + { + "ce_loss": 0.09368545562028885, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "distill_loss": 0.1227211132645607, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "ref_ce_loss": 0.048069968819618225, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "loss": 0.7590700387954712, + "step": 18370 + }, + { + "ce_loss": 0.07026736438274384, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "distill_loss": 0.19734793901443481, + "epoch": 6.127418278852568, + "step": 18370 + }, + { + "epoch": 6.127418278852568, + "ref_ce_loss": 0.03684297204017639, + "step": 18370 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.3519, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "grad_norm": 2.439100742340088, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "learning_rate": 4.0941330348736525e-05, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.20850054919719696, + "step": 18380 + }, + { + "ce_loss": 0.03334977477788925, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "distill_loss": 0.11176758259534836, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "ref_ce_loss": 0.04599212110042572, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.3658720552921295, + "step": 18380 + }, + { + "ce_loss": 0.055781181901693344, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "distill_loss": 0.21307574212551117, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "ref_ce_loss": 0.04664365574717522, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.30071496963500977, + "step": 18380 + }, + { + "ce_loss": 0.03584017977118492, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "distill_loss": 0.13493163883686066, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "ref_ce_loss": 0.06915021687746048, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "loss": 0.26241907477378845, + "step": 18380 + }, + { + "ce_loss": 0.049552708864212036, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "distill_loss": 0.14699327945709229, + "epoch": 6.1307538358905935, + "step": 18380 + }, + { + "epoch": 6.1307538358905935, + "ref_ce_loss": 0.06564631313085556, + "step": 18380 + }, + { + "epoch": 6.134089392928619, + "loss": 0.3589, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "grad_norm": 3.0384786128997803, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "learning_rate": 4.080235596601341e-05, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "loss": 0.24150526523590088, + "step": 18390 + }, + { + "ce_loss": 0.01162660401314497, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "distill_loss": 0.10201402008533478, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "ref_ce_loss": 0.055936913937330246, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "loss": 0.20581991970539093, + "step": 18390 + }, + { + "ce_loss": 0.0499425008893013, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "distill_loss": 0.10892849415540695, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "ref_ce_loss": 0.046872884035110474, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "loss": 0.4894862174987793, + "step": 18390 + }, + { + "ce_loss": 0.0837978646159172, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "distill_loss": 0.13355745375156403, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "ref_ce_loss": 0.06904693692922592, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "loss": 0.27033209800720215, + "step": 18390 + }, + { + "ce_loss": 0.0719723179936409, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "distill_loss": 0.1512235701084137, + "epoch": 6.134089392928619, + "step": 18390 + }, + { + "epoch": 6.134089392928619, + "ref_ce_loss": 0.035801418125629425, + "step": 18390 + }, + { + "epoch": 6.137424949966644, + "loss": 0.2963, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "grad_norm": 2.858560085296631, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "learning_rate": 4.0663580716718046e-05, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "loss": 0.2508600950241089, + "step": 18400 + }, + { + "ce_loss": 0.0432690791785717, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "distill_loss": 0.13280202448368073, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "ref_ce_loss": 0.06123881787061691, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "loss": 0.4933200776576996, + "step": 18400 + }, + { + "ce_loss": 0.05571569874882698, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "distill_loss": 0.18126748502254486, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "ref_ce_loss": 0.10519138723611832, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "loss": 0.2924741804599762, + "step": 18400 + }, + { + "ce_loss": 0.063815638422966, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "distill_loss": 0.15766538679599762, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "ref_ce_loss": 0.07054778188467026, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "loss": 0.23580388724803925, + "step": 18400 + }, + { + "ce_loss": 0.05556456372141838, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "distill_loss": 0.11789394915103912, + "epoch": 6.137424949966644, + "step": 18400 + }, + { + "epoch": 6.137424949966644, + "ref_ce_loss": 0.053790390491485596, + "step": 18400 + }, + { + "epoch": 6.14076050700467, + "loss": 0.2943, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "grad_norm": 1.7060301303863525, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "learning_rate": 4.052500485392176e-05, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "loss": 0.23654238879680634, + "step": 18410 + }, + { + "ce_loss": 0.06270783394575119, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "distill_loss": 0.10348555445671082, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "ref_ce_loss": 0.07024077326059341, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "loss": 0.2675868272781372, + "step": 18410 + }, + { + "ce_loss": 0.03235636278986931, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "distill_loss": 0.14261578023433685, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "ref_ce_loss": 0.07388745993375778, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "loss": 0.4724261462688446, + "step": 18410 + }, + { + "ce_loss": 0.12275946140289307, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "distill_loss": 0.22504441440105438, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "ref_ce_loss": 0.08914220333099365, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "loss": 0.21506237983703613, + "step": 18410 + }, + { + "ce_loss": 0.015775134786963463, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "distill_loss": 0.1167135238647461, + "epoch": 6.14076050700467, + "step": 18410 + }, + { + "epoch": 6.14076050700467, + "ref_ce_loss": 0.05251457169651985, + "step": 18410 + }, + { + "epoch": 6.144096064042695, + "loss": 0.3415, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "grad_norm": 2.5018398761749268, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "learning_rate": 4.038662863033226e-05, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "loss": 0.2518661320209503, + "step": 18420 + }, + { + "ce_loss": 0.04636304825544357, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "distill_loss": 0.1221461296081543, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "ref_ce_loss": 0.0639716386795044, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "loss": 0.1405506581068039, + "step": 18420 + }, + { + "ce_loss": 0.009569302201271057, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "distill_loss": 0.09424649178981781, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "ref_ce_loss": 0.036644306033849716, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "loss": 0.3834973871707916, + "step": 18420 + }, + { + "ce_loss": 0.08157524466514587, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "distill_loss": 0.12001895904541016, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "ref_ce_loss": 0.08984258025884628, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "loss": 0.34268125891685486, + "step": 18420 + }, + { + "ce_loss": 0.058754097670316696, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "distill_loss": 0.217606782913208, + "epoch": 6.144096064042695, + "step": 18420 + }, + { + "epoch": 6.144096064042695, + "ref_ce_loss": 0.04520254582166672, + "step": 18420 + }, + { + "epoch": 6.14743162108072, + "loss": 0.329, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "grad_norm": 2.1517646312713623, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "learning_rate": 4.024845229829323e-05, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "loss": 0.3015998303890228, + "step": 18430 + }, + { + "ce_loss": 0.062217775732278824, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "distill_loss": 0.131020188331604, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "ref_ce_loss": 0.06910224258899689, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "loss": 0.2663721442222595, + "step": 18430 + }, + { + "ce_loss": 0.036985091865062714, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "distill_loss": 0.13282321393489838, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "ref_ce_loss": 0.06474227458238602, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "loss": 0.278898686170578, + "step": 18430 + }, + { + "ce_loss": 0.07325167953968048, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "distill_loss": 0.14880669116973877, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "ref_ce_loss": 0.05667537450790405, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "loss": 0.28270652890205383, + "step": 18430 + }, + { + "ce_loss": 0.06054554879665375, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "distill_loss": 0.15518692135810852, + "epoch": 6.14743162108072, + "step": 18430 + }, + { + "epoch": 6.14743162108072, + "ref_ce_loss": 0.06662452220916748, + "step": 18430 + }, + { + "epoch": 6.150767178118746, + "loss": 0.3253, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "grad_norm": 2.1012563705444336, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "learning_rate": 4.0110476109783726e-05, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "loss": 0.36245232820510864, + "step": 18440 + }, + { + "ce_loss": 0.053363729268312454, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "distill_loss": 0.13131973147392273, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "ref_ce_loss": 0.09455405175685883, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "loss": 0.29794377088546753, + "step": 18440 + }, + { + "ce_loss": 0.03110991045832634, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "distill_loss": 0.12349390238523483, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "ref_ce_loss": 0.06310545653104782, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "loss": 0.3318029046058655, + "step": 18440 + }, + { + "ce_loss": 0.06962364166975021, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "distill_loss": 0.0969667062163353, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "ref_ce_loss": 0.10596779733896255, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "loss": 0.33284032344818115, + "step": 18440 + }, + { + "ce_loss": 0.042060963809490204, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "distill_loss": 0.11621900647878647, + "epoch": 6.150767178118746, + "step": 18440 + }, + { + "epoch": 6.150767178118746, + "ref_ce_loss": 0.04753798618912697, + "step": 18440 + }, + { + "epoch": 6.154102735156771, + "loss": 0.3369, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "grad_norm": 2.206515312194824, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "learning_rate": 3.997270031641791e-05, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "loss": 0.46877241134643555, + "step": 18450 + }, + { + "ce_loss": 0.08415161073207855, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "distill_loss": 0.1510000228881836, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "ref_ce_loss": 0.07368329912424088, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "loss": 0.25681328773498535, + "step": 18450 + }, + { + "ce_loss": 0.07156962156295776, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "distill_loss": 0.1276722401380539, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "ref_ce_loss": 0.05735990032553673, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "loss": 0.3189067840576172, + "step": 18450 + }, + { + "ce_loss": 0.04861884191632271, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "distill_loss": 0.13847175240516663, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "ref_ce_loss": 0.08016352355480194, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "loss": 0.26889464259147644, + "step": 18450 + }, + { + "ce_loss": 0.06599020957946777, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "distill_loss": 0.1534539759159088, + "epoch": 6.154102735156771, + "step": 18450 + }, + { + "epoch": 6.154102735156771, + "ref_ce_loss": 0.048720572143793106, + "step": 18450 + }, + { + "epoch": 6.157438292194796, + "loss": 0.296, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "grad_norm": 1.9747674465179443, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "learning_rate": 3.9835125169444485e-05, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "loss": 0.2924409806728363, + "step": 18460 + }, + { + "ce_loss": 0.04371657222509384, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "distill_loss": 0.14328262209892273, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "ref_ce_loss": 0.056337349116802216, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "loss": 0.39455586671829224, + "step": 18460 + }, + { + "ce_loss": 0.07246027141809464, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "distill_loss": 0.14171691238880157, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "ref_ce_loss": 0.08444765955209732, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "loss": 0.2808062434196472, + "step": 18460 + }, + { + "ce_loss": 0.037363942712545395, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "distill_loss": 0.17661446332931519, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "ref_ce_loss": 0.043351124972105026, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "loss": 0.27473172545433044, + "step": 18460 + }, + { + "ce_loss": 0.062011852860450745, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "distill_loss": 0.11762156337499619, + "epoch": 6.157438292194796, + "step": 18460 + }, + { + "epoch": 6.157438292194796, + "ref_ce_loss": 0.06052962690591812, + "step": 18460 + }, + { + "epoch": 6.160773849232822, + "loss": 0.3145, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "grad_norm": 2.9975457191467285, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "learning_rate": 3.9697750919746255e-05, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "loss": 0.8400434255599976, + "step": 18470 + }, + { + "ce_loss": 0.06479611247777939, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "distill_loss": 0.13007411360740662, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "ref_ce_loss": 0.0658654272556305, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "loss": 0.17636524140834808, + "step": 18470 + }, + { + "ce_loss": 0.019910158589482307, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "distill_loss": 0.08979976177215576, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "ref_ce_loss": 0.03886188566684723, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "loss": 0.30023443698883057, + "step": 18470 + }, + { + "ce_loss": 0.01413907390087843, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "distill_loss": 0.10442767292261124, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "ref_ce_loss": 0.04619559645652771, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "loss": 0.3421204686164856, + "step": 18470 + }, + { + "ce_loss": 0.06843894720077515, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "distill_loss": 0.13290368020534515, + "epoch": 6.160773849232822, + "step": 18470 + }, + { + "epoch": 6.160773849232822, + "ref_ce_loss": 0.09307733923196793, + "step": 18470 + }, + { + "epoch": 6.164109406270847, + "loss": 0.3195, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "grad_norm": 2.4326090812683105, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "learning_rate": 3.9560577817839664e-05, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "loss": 0.2917850613594055, + "step": 18480 + }, + { + "ce_loss": 0.06054378300905228, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "distill_loss": 0.12447196245193481, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "ref_ce_loss": 0.04743599891662598, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "loss": 0.25405004620552063, + "step": 18480 + }, + { + "ce_loss": 0.04941348731517792, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "distill_loss": 0.14396850764751434, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "ref_ce_loss": 0.049728021025657654, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "loss": 0.42917001247406006, + "step": 18480 + }, + { + "ce_loss": 0.018850069493055344, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "distill_loss": 0.16858190298080444, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "ref_ce_loss": 0.04189824312925339, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "loss": 0.37748169898986816, + "step": 18480 + }, + { + "ce_loss": 0.061456672847270966, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "distill_loss": 0.1580883413553238, + "epoch": 6.164109406270847, + "step": 18480 + }, + { + "epoch": 6.164109406270847, + "ref_ce_loss": 0.07029583305120468, + "step": 18480 + }, + { + "epoch": 6.167444963308872, + "loss": 0.3445, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "grad_norm": 2.8507888317108154, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "learning_rate": 3.942360611387438e-05, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "loss": 0.3325313627719879, + "step": 18490 + }, + { + "ce_loss": 0.049201712012290955, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "distill_loss": 0.12399528175592422, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "ref_ce_loss": 0.0797731950879097, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "loss": 0.2676965296268463, + "step": 18490 + }, + { + "ce_loss": 0.038861967623233795, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "distill_loss": 0.12871843576431274, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "ref_ce_loss": 0.06563683599233627, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "loss": 0.23812928795814514, + "step": 18490 + }, + { + "ce_loss": 0.04163096100091934, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "distill_loss": 0.13853895664215088, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "ref_ce_loss": 0.04453599825501442, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "loss": 0.20199045538902283, + "step": 18490 + }, + { + "ce_loss": 0.020122459158301353, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "distill_loss": 0.11555896699428558, + "epoch": 6.167444963308872, + "step": 18490 + }, + { + "epoch": 6.167444963308872, + "ref_ce_loss": 0.044083766639232635, + "step": 18490 + }, + { + "epoch": 6.170780520346898, + "loss": 0.3082, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "grad_norm": 5.568923473358154, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "learning_rate": 3.928683605763267e-05, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "loss": 0.35506847500801086, + "step": 18500 + }, + { + "ce_loss": 0.05164219066500664, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "distill_loss": 0.13775408267974854, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "ref_ce_loss": 0.05966204032301903, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "loss": 0.277235209941864, + "step": 18500 + }, + { + "ce_loss": 0.0458344966173172, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "distill_loss": 0.14046710729599, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "ref_ce_loss": 0.06634150445461273, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "loss": 0.26391613483428955, + "step": 18500 + }, + { + "ce_loss": 0.006200232543051243, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "distill_loss": 0.13295315206050873, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "ref_ce_loss": 0.03400679677724838, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "loss": 0.7266954779624939, + "step": 18500 + }, + { + "ce_loss": 0.05183488875627518, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "distill_loss": 0.13190777599811554, + "epoch": 6.170780520346898, + "step": 18500 + }, + { + "epoch": 6.170780520346898, + "ref_ce_loss": 0.06639143824577332, + "step": 18500 + }, + { + "epoch": 6.174116077384923, + "loss": 0.3219, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "grad_norm": 3.9889957904815674, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "learning_rate": 3.915026789852921e-05, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "loss": 0.281459778547287, + "step": 18510 + }, + { + "ce_loss": 0.04788195714354515, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "distill_loss": 0.1647387593984604, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "ref_ce_loss": 0.06857550889253616, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "loss": 0.3966805934906006, + "step": 18510 + }, + { + "ce_loss": 0.060191236436367035, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "distill_loss": 0.19976350665092468, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "ref_ce_loss": 0.07919872552156448, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "loss": 0.5716406106948853, + "step": 18510 + }, + { + "ce_loss": 0.04771149158477783, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "distill_loss": 0.11342655122280121, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "ref_ce_loss": 0.07589419186115265, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "loss": 0.19318389892578125, + "step": 18510 + }, + { + "ce_loss": 0.01549871638417244, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "distill_loss": 0.10186982154846191, + "epoch": 6.174116077384923, + "step": 18510 + }, + { + "epoch": 6.174116077384923, + "ref_ce_loss": 0.04499030485749245, + "step": 18510 + }, + { + "epoch": 6.177451634422948, + "loss": 0.3339, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "grad_norm": 3.996617078781128, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "learning_rate": 3.901390188561046e-05, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "loss": 0.2945614159107208, + "step": 18520 + }, + { + "ce_loss": 0.04369880631566048, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "distill_loss": 0.14246287941932678, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "ref_ce_loss": 0.07933271676301956, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "loss": 0.20383426547050476, + "step": 18520 + }, + { + "ce_loss": 0.005165026523172855, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "distill_loss": 0.12378333508968353, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "ref_ce_loss": 0.04908745735883713, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "loss": 0.25932228565216064, + "step": 18520 + }, + { + "ce_loss": 0.04456058144569397, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "distill_loss": 0.15556976199150085, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "ref_ce_loss": 0.059071388095617294, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "loss": 0.560404896736145, + "step": 18520 + }, + { + "ce_loss": 0.05589801073074341, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "distill_loss": 0.2018391489982605, + "epoch": 6.177451634422948, + "step": 18520 + }, + { + "epoch": 6.177451634422948, + "ref_ce_loss": 0.04681461676955223, + "step": 18520 + }, + { + "epoch": 6.180787191460974, + "loss": 0.3013, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "grad_norm": 2.298755407333374, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "learning_rate": 3.8877738267554214e-05, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "loss": 0.2826690077781677, + "step": 18530 + }, + { + "ce_loss": 0.027750806882977486, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "distill_loss": 0.09200765192508698, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "ref_ce_loss": 0.06503093987703323, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "loss": 0.326167494058609, + "step": 18530 + }, + { + "ce_loss": 0.046595703810453415, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "distill_loss": 0.15352821350097656, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "ref_ce_loss": 0.07511696964502335, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "loss": 0.26359185576438904, + "step": 18530 + }, + { + "ce_loss": 0.03271767497062683, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "distill_loss": 0.1250348687171936, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "ref_ce_loss": 0.05025747790932655, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "loss": 0.2011631727218628, + "step": 18530 + }, + { + "ce_loss": 0.005348066333681345, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "distill_loss": 0.10876020789146423, + "epoch": 6.180787191460974, + "step": 18530 + }, + { + "epoch": 6.180787191460974, + "ref_ce_loss": 0.04273238778114319, + "step": 18530 + }, + { + "epoch": 6.184122748498999, + "loss": 0.2862, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "grad_norm": 2.2494394779205322, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "learning_rate": 3.8741777292669276e-05, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "loss": 0.6624394655227661, + "step": 18540 + }, + { + "ce_loss": 0.032520171254873276, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "distill_loss": 0.18035253882408142, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "ref_ce_loss": 0.05871713161468506, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "loss": 0.2021605521440506, + "step": 18540 + }, + { + "ce_loss": 0.024922169744968414, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "distill_loss": 0.10917256772518158, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "ref_ce_loss": 0.06801718473434448, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "loss": 0.37601831555366516, + "step": 18540 + }, + { + "ce_loss": 0.04148799553513527, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "distill_loss": 0.12794163823127747, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "ref_ce_loss": 0.07430071383714676, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "loss": 0.6574068069458008, + "step": 18540 + }, + { + "ce_loss": 0.08162372559309006, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "distill_loss": 0.2035280168056488, + "epoch": 6.184122748498999, + "step": 18540 + }, + { + "epoch": 6.184122748498999, + "ref_ce_loss": 0.09160168468952179, + "step": 18540 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.3284, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "grad_norm": 1.9440633058547974, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "learning_rate": 3.8606019208894725e-05, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.3212183117866516, + "step": 18550 + }, + { + "ce_loss": 0.09615284949541092, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "distill_loss": 0.13871519267559052, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "ref_ce_loss": 0.05264740064740181, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.7384791374206543, + "step": 18550 + }, + { + "ce_loss": 0.011780355125665665, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "distill_loss": 0.11695633828639984, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "ref_ce_loss": 0.07216423749923706, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.35503384470939636, + "step": 18550 + }, + { + "ce_loss": 0.015260746702551842, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "distill_loss": 0.12225035578012466, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "ref_ce_loss": 0.040481239557266235, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "loss": 0.3401867151260376, + "step": 18550 + }, + { + "ce_loss": 0.03045118972659111, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "distill_loss": 0.13645508885383606, + "epoch": 6.1874583055370245, + "step": 18550 + }, + { + "epoch": 6.1874583055370245, + "ref_ce_loss": 0.033003069460392, + "step": 18550 + }, + { + "epoch": 6.19079386257505, + "loss": 0.3385, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "grad_norm": 2.8384790420532227, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "learning_rate": 3.8470464263799824e-05, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "loss": 0.20757602155208588, + "step": 18560 + }, + { + "ce_loss": 0.021547259762883186, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "distill_loss": 0.1290460079908371, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "ref_ce_loss": 0.03783115744590759, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "loss": 0.2212572544813156, + "step": 18560 + }, + { + "ce_loss": 0.020918430760502815, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "distill_loss": 0.14228373765945435, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "ref_ce_loss": 0.0429234504699707, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "loss": 0.32608312368392944, + "step": 18560 + }, + { + "ce_loss": 0.05824834108352661, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "distill_loss": 0.12231164425611496, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "ref_ce_loss": 0.05168101191520691, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "loss": 0.3267654776573181, + "step": 18560 + }, + { + "ce_loss": 0.03888304904103279, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "distill_loss": 0.21609488129615784, + "epoch": 6.19079386257505, + "step": 18560 + }, + { + "epoch": 6.19079386257505, + "ref_ce_loss": 0.07168714702129364, + "step": 18560 + }, + { + "epoch": 6.194129419613075, + "loss": 0.3106, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "grad_norm": 2.127432346343994, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "learning_rate": 3.833511270458322e-05, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "loss": 0.3519989550113678, + "step": 18570 + }, + { + "ce_loss": 0.044699445366859436, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "distill_loss": 0.1321500837802887, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "ref_ce_loss": 0.09541293233633041, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "loss": 0.38467326760292053, + "step": 18570 + }, + { + "ce_loss": 0.11054366827011108, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "distill_loss": 0.18650639057159424, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "ref_ce_loss": 0.06626542657613754, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "loss": 0.19615034759044647, + "step": 18570 + }, + { + "ce_loss": 0.03159404918551445, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "distill_loss": 0.10044976323843002, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "ref_ce_loss": 0.04699920117855072, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "loss": 0.42845267057418823, + "step": 18570 + }, + { + "ce_loss": 0.10839418321847916, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "distill_loss": 0.2222888171672821, + "epoch": 6.194129419613075, + "step": 18570 + }, + { + "epoch": 6.194129419613075, + "ref_ce_loss": 0.09751333296298981, + "step": 18570 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.3156, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "grad_norm": 5.873178482055664, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "learning_rate": 3.819996477807288e-05, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.38896477222442627, + "step": 18580 + }, + { + "ce_loss": 0.003704048926010728, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "distill_loss": 0.1404985636472702, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "ref_ce_loss": 0.06484709680080414, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.23296181857585907, + "step": 18580 + }, + { + "ce_loss": 0.028243107721209526, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "distill_loss": 0.1301645040512085, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "ref_ce_loss": 0.05618060752749443, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.3045993745326996, + "step": 18580 + }, + { + "ce_loss": 0.010499167256057262, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "distill_loss": 0.13495343923568726, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "ref_ce_loss": 0.08178295940160751, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "loss": 0.19076423346996307, + "step": 18580 + }, + { + "ce_loss": 0.014703667722642422, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "distill_loss": 0.11374014616012573, + "epoch": 6.1974649766511005, + "step": 18580 + }, + { + "epoch": 6.1974649766511005, + "ref_ce_loss": 0.062148548662662506, + "step": 18580 + }, + { + "epoch": 6.200800533689126, + "loss": 0.3399, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "grad_norm": 2.2601478099823, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "learning_rate": 3.8065020730725305e-05, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "loss": 0.3093893826007843, + "step": 18590 + }, + { + "ce_loss": 0.033915240317583084, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "distill_loss": 0.14568378031253815, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "ref_ce_loss": 0.06362231075763702, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "loss": 0.2616564631462097, + "step": 18590 + }, + { + "ce_loss": 0.08007301390171051, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "distill_loss": 0.1228436678647995, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "ref_ce_loss": 0.04259605333209038, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "loss": 0.29220378398895264, + "step": 18590 + }, + { + "ce_loss": 0.06992803514003754, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "distill_loss": 0.13908424973487854, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "ref_ce_loss": 0.06554904580116272, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "loss": 0.4914194345474243, + "step": 18590 + }, + { + "ce_loss": 0.04688086733222008, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "distill_loss": 0.23791128396987915, + "epoch": 6.200800533689126, + "step": 18590 + }, + { + "epoch": 6.200800533689126, + "ref_ce_loss": 0.07891146838665009, + "step": 18590 + }, + { + "epoch": 6.204136090727151, + "loss": 0.3287, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "grad_norm": 2.0706448554992676, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "learning_rate": 3.7930280808625136e-05, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "loss": 0.14727945625782013, + "step": 18600 + }, + { + "ce_loss": 0.011500568129122257, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "distill_loss": 0.07382229715585709, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "ref_ce_loss": 0.04233421012759209, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "loss": 0.6932018399238586, + "step": 18600 + }, + { + "ce_loss": 0.013805784285068512, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "distill_loss": 0.11833083629608154, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "ref_ce_loss": 0.06745176017284393, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "loss": 0.2108478993177414, + "step": 18600 + }, + { + "ce_loss": 0.043616849929094315, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "distill_loss": 0.11201032251119614, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "ref_ce_loss": 0.05514965206384659, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "loss": 0.27369120717048645, + "step": 18600 + }, + { + "ce_loss": 0.027406323701143265, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "distill_loss": 0.13315467536449432, + "epoch": 6.204136090727151, + "step": 18600 + }, + { + "epoch": 6.204136090727151, + "ref_ce_loss": 0.07010474056005478, + "step": 18600 + }, + { + "epoch": 6.207471647765177, + "loss": 0.3429, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "grad_norm": 2.937699556350708, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "learning_rate": 3.7795745257484875e-05, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "loss": 0.279723584651947, + "step": 18610 + }, + { + "ce_loss": 0.022853851318359375, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "distill_loss": 0.11497896909713745, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "ref_ce_loss": 0.05497308075428009, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "loss": 0.23577311635017395, + "step": 18610 + }, + { + "ce_loss": 0.0342148058116436, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "distill_loss": 0.09748812764883041, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "ref_ce_loss": 0.04478719085454941, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "loss": 0.20509502291679382, + "step": 18610 + }, + { + "ce_loss": 0.0038550468161702156, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "distill_loss": 0.1022908017039299, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "ref_ce_loss": 0.03298857808113098, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "loss": 0.28083717823028564, + "step": 18610 + }, + { + "ce_loss": 0.02645295299589634, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "distill_loss": 0.10292775928974152, + "epoch": 6.207471647765177, + "step": 18610 + }, + { + "epoch": 6.207471647765177, + "ref_ce_loss": 0.050342313945293427, + "step": 18610 + }, + { + "epoch": 6.210807204803202, + "loss": 0.3456, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "grad_norm": 2.887526273727417, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "learning_rate": 3.7661414322644326e-05, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "loss": 0.26322484016418457, + "step": 18620 + }, + { + "ce_loss": 0.05786026641726494, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "distill_loss": 0.11787180602550507, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "ref_ce_loss": 0.0689215213060379, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "loss": 0.571141242980957, + "step": 18620 + }, + { + "ce_loss": 0.05233941599726677, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "distill_loss": 0.14902040362358093, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "ref_ce_loss": 0.06474089622497559, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "loss": 0.4037763774394989, + "step": 18620 + }, + { + "ce_loss": 0.05231030285358429, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "distill_loss": 0.18944039940834045, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "ref_ce_loss": 0.0811677798628807, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "loss": 0.17138558626174927, + "step": 18620 + }, + { + "ce_loss": 0.0197373665869236, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "distill_loss": 0.08096668869256973, + "epoch": 6.210807204803202, + "step": 18620 + }, + { + "epoch": 6.210807204803202, + "ref_ce_loss": 0.052383799105882645, + "step": 18620 + }, + { + "epoch": 6.214142761841227, + "loss": 0.3277, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "grad_norm": 2.4172303676605225, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "learning_rate": 3.7527288249070034e-05, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "loss": 0.38156095147132874, + "step": 18630 + }, + { + "ce_loss": 0.1050412729382515, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "distill_loss": 0.13883177936077118, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "ref_ce_loss": 0.05640149861574173, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "loss": 0.455706387758255, + "step": 18630 + }, + { + "ce_loss": 0.057640500366687775, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "distill_loss": 0.13849633932113647, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "ref_ce_loss": 0.05576207861304283, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "loss": 0.2372320145368576, + "step": 18630 + }, + { + "ce_loss": 0.04792383685708046, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "distill_loss": 0.11418038606643677, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "ref_ce_loss": 0.05012698099017143, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "loss": 0.23415352404117584, + "step": 18630 + }, + { + "ce_loss": 0.04273424670100212, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "distill_loss": 0.12337204068899155, + "epoch": 6.214142761841227, + "step": 18630 + }, + { + "epoch": 6.214142761841227, + "ref_ce_loss": 0.05350811779499054, + "step": 18630 + }, + { + "epoch": 6.217478318879253, + "loss": 0.3443, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "grad_norm": 3.1628241539001465, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "learning_rate": 3.739336728135519e-05, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "loss": 0.22019442915916443, + "step": 18640 + }, + { + "ce_loss": 0.012670139782130718, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "distill_loss": 0.11298336833715439, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "ref_ce_loss": 0.06662783771753311, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "loss": 0.3011285960674286, + "step": 18640 + }, + { + "ce_loss": 0.05775625631213188, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "distill_loss": 0.15119341015815735, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "ref_ce_loss": 0.04562750086188316, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "loss": 0.3526672124862671, + "step": 18640 + }, + { + "ce_loss": 0.028817400336265564, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "distill_loss": 0.11797647178173065, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "ref_ce_loss": 0.052301883697509766, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "loss": 0.3002309203147888, + "step": 18640 + }, + { + "ce_loss": 0.016758514568209648, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "distill_loss": 0.15333794057369232, + "epoch": 6.217478318879253, + "step": 18640 + }, + { + "epoch": 6.217478318879253, + "ref_ce_loss": 0.05648922920227051, + "step": 18640 + }, + { + "epoch": 6.220813875917278, + "loss": 0.3011, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "grad_norm": 2.3673858642578125, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "learning_rate": 3.7259651663718684e-05, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "loss": 0.3125011622905731, + "step": 18650 + }, + { + "ce_loss": 0.046617768704891205, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "distill_loss": 0.1842920333147049, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "ref_ce_loss": 0.08146623522043228, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "loss": 0.3332507312297821, + "step": 18650 + }, + { + "ce_loss": 0.029593532904982567, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "distill_loss": 0.15803472697734833, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "ref_ce_loss": 0.053554873913526535, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "loss": 0.43323659896850586, + "step": 18650 + }, + { + "ce_loss": 0.021801965311169624, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "distill_loss": 0.18702299892902374, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "ref_ce_loss": 0.06735753268003464, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "loss": 0.38802433013916016, + "step": 18650 + }, + { + "ce_loss": 0.06311264634132385, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "distill_loss": 0.16197697818279266, + "epoch": 6.220813875917278, + "step": 18650 + }, + { + "epoch": 6.220813875917278, + "ref_ce_loss": 0.09785512834787369, + "step": 18650 + }, + { + "epoch": 6.224149432955303, + "loss": 0.3536, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "grad_norm": 2.0434508323669434, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "learning_rate": 3.71261416400051e-05, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "loss": 0.19896145164966583, + "step": 18660 + }, + { + "ce_loss": 0.021050764247775078, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "distill_loss": 0.12107603251934052, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "ref_ce_loss": 0.03518590331077576, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "loss": 0.342231810092926, + "step": 18660 + }, + { + "ce_loss": 0.04348944127559662, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "distill_loss": 0.17786476016044617, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "ref_ce_loss": 0.07756116986274719, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "loss": 0.22609391808509827, + "step": 18660 + }, + { + "ce_loss": 0.04094768688082695, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "distill_loss": 0.1311914473772049, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "ref_ce_loss": 0.028837168589234352, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "loss": 0.4177436828613281, + "step": 18660 + }, + { + "ce_loss": 0.1019946038722992, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "distill_loss": 0.1752627044916153, + "epoch": 6.224149432955303, + "step": 18660 + }, + { + "epoch": 6.224149432955303, + "ref_ce_loss": 0.0595175102353096, + "step": 18660 + }, + { + "epoch": 6.227484989993329, + "loss": 0.3315, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "grad_norm": 2.711292028427124, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "learning_rate": 3.699283745368412e-05, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "loss": 0.30362796783447266, + "step": 18670 + }, + { + "ce_loss": 0.0400133915245533, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "distill_loss": 0.1721460521221161, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "ref_ce_loss": 0.0534956268966198, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "loss": 0.2800109386444092, + "step": 18670 + }, + { + "ce_loss": 0.011257833801209927, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "distill_loss": 0.12428543716669083, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "ref_ce_loss": 0.05313796177506447, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "loss": 0.1609666347503662, + "step": 18670 + }, + { + "ce_loss": 0.02284008264541626, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "distill_loss": 0.07858970016241074, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "ref_ce_loss": 0.02820756286382675, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "loss": 0.20992793142795563, + "step": 18670 + }, + { + "ce_loss": 0.028739934787154198, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "distill_loss": 0.11657550185918808, + "epoch": 6.227484989993329, + "step": 18670 + }, + { + "epoch": 6.227484989993329, + "ref_ce_loss": 0.06436599791049957, + "step": 18670 + }, + { + "epoch": 6.230820547031354, + "loss": 0.2938, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "grad_norm": 2.294095277786255, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "learning_rate": 3.6859739347849884e-05, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "loss": 0.3429484963417053, + "step": 18680 + }, + { + "ce_loss": 0.06927967071533203, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "distill_loss": 0.17625319957733154, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "ref_ce_loss": 0.07992861419916153, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "loss": 0.4963828921318054, + "step": 18680 + }, + { + "ce_loss": 0.053090523928403854, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "distill_loss": 0.32863304018974304, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "ref_ce_loss": 0.08859732747077942, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "loss": 0.4164791703224182, + "step": 18680 + }, + { + "ce_loss": 0.05513370782136917, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "distill_loss": 0.2739354074001312, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "ref_ce_loss": 0.07069216668605804, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "loss": 0.1833503097295761, + "step": 18680 + }, + { + "ce_loss": 0.013990293256938457, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "distill_loss": 0.11608105152845383, + "epoch": 6.230820547031354, + "step": 18680 + }, + { + "epoch": 6.230820547031354, + "ref_ce_loss": 0.05309043079614639, + "step": 18680 + }, + { + "epoch": 6.234156104069379, + "loss": 0.3561, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "grad_norm": 2.279000759124756, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "learning_rate": 3.6726847565220895e-05, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "loss": 0.25862422585487366, + "step": 18690 + }, + { + "ce_loss": 0.022512095049023628, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "distill_loss": 0.1802578717470169, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "ref_ce_loss": 0.055745966732501984, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "loss": 0.18684840202331543, + "step": 18690 + }, + { + "ce_loss": 0.015279823914170265, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "distill_loss": 0.09220601618289948, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "ref_ce_loss": 0.05309228226542473, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "loss": 0.23141039907932281, + "step": 18690 + }, + { + "ce_loss": 0.030733682215213776, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "distill_loss": 0.1341591477394104, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "ref_ce_loss": 0.04584375396370888, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "loss": 0.43140465021133423, + "step": 18690 + }, + { + "ce_loss": 0.019533297047019005, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "distill_loss": 0.1933666169643402, + "epoch": 6.234156104069379, + "step": 18690 + }, + { + "epoch": 6.234156104069379, + "ref_ce_loss": 0.08564330637454987, + "step": 18690 + }, + { + "epoch": 6.237491661107405, + "loss": 0.3111, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "grad_norm": 5.9803466796875, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "learning_rate": 3.659416234813932e-05, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "loss": 0.22904738783836365, + "step": 18700 + }, + { + "ce_loss": 0.015067033469676971, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "distill_loss": 0.12943653762340546, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "ref_ce_loss": 0.04217130318284035, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "loss": 0.22289657592773438, + "step": 18700 + }, + { + "ce_loss": 0.016781846061348915, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "distill_loss": 0.1689828336238861, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "ref_ce_loss": 0.03697735071182251, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "loss": 0.534015417098999, + "step": 18700 + }, + { + "ce_loss": 0.08496256917715073, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "distill_loss": 0.2338809370994568, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "ref_ce_loss": 0.06475655734539032, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "loss": 0.27168723940849304, + "step": 18700 + }, + { + "ce_loss": 0.030710462480783463, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "distill_loss": 0.17783647775650024, + "epoch": 6.237491661107405, + "step": 18700 + }, + { + "epoch": 6.237491661107405, + "ref_ce_loss": 0.05239542946219444, + "step": 18700 + }, + { + "epoch": 6.24082721814543, + "loss": 0.3582, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "grad_norm": 2.7943332195281982, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "learning_rate": 3.64616839385707e-05, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "loss": 0.3018385171890259, + "step": 18710 + }, + { + "ce_loss": 0.007486680056899786, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "distill_loss": 0.13992555439472198, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "ref_ce_loss": 0.057693563401699066, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "loss": 0.48539677262306213, + "step": 18710 + }, + { + "ce_loss": 0.06905413419008255, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "distill_loss": 0.18162912130355835, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "ref_ce_loss": 0.05399453267455101, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "loss": 0.24727633595466614, + "step": 18710 + }, + { + "ce_loss": 0.022812293842434883, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "distill_loss": 0.11789606511592865, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "ref_ce_loss": 0.029683204367756844, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "loss": 0.30093979835510254, + "step": 18710 + }, + { + "ce_loss": 0.022619565948843956, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "distill_loss": 0.20535236597061157, + "epoch": 6.24082721814543, + "step": 18710 + }, + { + "epoch": 6.24082721814543, + "ref_ce_loss": 0.05883384123444557, + "step": 18710 + }, + { + "epoch": 6.244162775183455, + "loss": 0.323, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "grad_norm": 2.1610376834869385, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "learning_rate": 3.6329412578103386e-05, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "loss": 0.2980462312698364, + "step": 18720 + }, + { + "ce_loss": 0.02790931798517704, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "distill_loss": 0.15959838032722473, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "ref_ce_loss": 0.08582749217748642, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "loss": 0.28372853994369507, + "step": 18720 + }, + { + "ce_loss": 0.05045516416430473, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "distill_loss": 0.15782198309898376, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "ref_ce_loss": 0.058288849890232086, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "loss": 0.3082246482372284, + "step": 18720 + }, + { + "ce_loss": 0.04321841150522232, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "distill_loss": 0.15502133965492249, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "ref_ce_loss": 0.04743940383195877, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "loss": 0.3014211058616638, + "step": 18720 + }, + { + "ce_loss": 0.05305873975157738, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "distill_loss": 0.14042118191719055, + "epoch": 6.244162775183455, + "step": 18720 + }, + { + "epoch": 6.244162775183455, + "ref_ce_loss": 0.06918078660964966, + "step": 18720 + }, + { + "epoch": 6.247498332221481, + "loss": 0.3093, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "grad_norm": 2.835282325744629, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "learning_rate": 3.6197348507948085e-05, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "loss": 0.389096736907959, + "step": 18730 + }, + { + "ce_loss": 0.11738903820514679, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "distill_loss": 0.1438477784395218, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "ref_ce_loss": 0.06399208307266235, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "loss": 0.2826199531555176, + "step": 18730 + }, + { + "ce_loss": 0.020239872857928276, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "distill_loss": 0.1740793138742447, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "ref_ce_loss": 0.06345031410455704, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "loss": 0.49008870124816895, + "step": 18730 + }, + { + "ce_loss": 0.10059117525815964, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "distill_loss": 0.13844041526317596, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "ref_ce_loss": 0.07132607698440552, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "loss": 0.2719220817089081, + "step": 18730 + }, + { + "ce_loss": 0.05062330514192581, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "distill_loss": 0.1395432949066162, + "epoch": 6.247498332221481, + "step": 18730 + }, + { + "epoch": 6.247498332221481, + "ref_ce_loss": 0.05693845823407173, + "step": 18730 + }, + { + "epoch": 6.250833889259506, + "loss": 0.3179, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "grad_norm": 2.3968071937561035, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "learning_rate": 3.606549196893764e-05, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "loss": 0.35895127058029175, + "step": 18740 + }, + { + "ce_loss": 0.03985409811139107, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "distill_loss": 0.12473776191473007, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "ref_ce_loss": 0.03933054581284523, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "loss": 0.4552100896835327, + "step": 18740 + }, + { + "ce_loss": 0.0754159539937973, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "distill_loss": 0.16453154385089874, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "ref_ce_loss": 0.06217661872506142, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "loss": 0.2528969347476959, + "step": 18740 + }, + { + "ce_loss": 0.049444712698459625, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "distill_loss": 0.12250962853431702, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "ref_ce_loss": 0.05724600329995155, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "loss": 0.23986639082431793, + "step": 18740 + }, + { + "ce_loss": 0.02679796703159809, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "distill_loss": 0.15213319659233093, + "epoch": 6.250833889259506, + "step": 18740 + }, + { + "epoch": 6.250833889259506, + "ref_ce_loss": 0.06067092344164848, + "step": 18740 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.35, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "grad_norm": 3.7008721828460693, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "learning_rate": 3.593384320152636e-05, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.25133639574050903, + "step": 18750 + }, + { + "ce_loss": 0.040532004088163376, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "distill_loss": 0.13248834013938904, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "ref_ce_loss": 0.04700193554162979, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.3965201675891876, + "step": 18750 + }, + { + "ce_loss": 0.07706707715988159, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "distill_loss": 0.21770936250686646, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "ref_ce_loss": 0.05477938801050186, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.5275173187255859, + "step": 18750 + }, + { + "ce_loss": 0.10376562178134918, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "distill_loss": 0.18406124413013458, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "ref_ce_loss": 0.11001072824001312, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "loss": 0.3048272728919983, + "step": 18750 + }, + { + "ce_loss": 0.02338281460106373, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "distill_loss": 0.15202832221984863, + "epoch": 6.2541694462975315, + "step": 18750 + }, + { + "epoch": 6.2541694462975315, + "ref_ce_loss": 0.08240233361721039, + "step": 18750 + }, + { + "epoch": 6.257505003335557, + "loss": 0.3237, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "grad_norm": 2.6808176040649414, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "learning_rate": 3.5802402445789625e-05, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "loss": 0.19271326065063477, + "step": 18760 + }, + { + "ce_loss": 0.020790955051779747, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "distill_loss": 0.11469073593616486, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "ref_ce_loss": 0.03989434987306595, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "loss": 0.43920618295669556, + "step": 18760 + }, + { + "ce_loss": 0.061031486839056015, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "distill_loss": 0.16793349385261536, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "ref_ce_loss": 0.08771173655986786, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "loss": 0.33124980330467224, + "step": 18760 + }, + { + "ce_loss": 0.042613252997398376, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "distill_loss": 0.1423777937889099, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "ref_ce_loss": 0.09082692116498947, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "loss": 0.47902119159698486, + "step": 18760 + }, + { + "ce_loss": 0.044841740280389786, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "distill_loss": 0.1492442935705185, + "epoch": 6.257505003335557, + "step": 18760 + }, + { + "epoch": 6.257505003335557, + "ref_ce_loss": 0.07554224133491516, + "step": 18760 + }, + { + "epoch": 6.260840560373582, + "loss": 0.3518, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "grad_norm": 3.183990716934204, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "learning_rate": 3.567116994142362e-05, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "loss": 0.20660609006881714, + "step": 18770 + }, + { + "ce_loss": 0.015359489247202873, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "distill_loss": 0.1334400177001953, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "ref_ce_loss": 0.05756957083940506, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "loss": 0.26321983337402344, + "step": 18770 + }, + { + "ce_loss": 0.022753337398171425, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "distill_loss": 0.1181974858045578, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "ref_ce_loss": 0.05340166762471199, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "loss": 0.24762709438800812, + "step": 18770 + }, + { + "ce_loss": 0.016061890870332718, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "distill_loss": 0.17953771352767944, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "ref_ce_loss": 0.051859185099601746, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "loss": 0.2978662848472595, + "step": 18770 + }, + { + "ce_loss": 0.056625351309776306, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "distill_loss": 0.13011780381202698, + "epoch": 6.260840560373582, + "step": 18770 + }, + { + "epoch": 6.260840560373582, + "ref_ce_loss": 0.0658249706029892, + "step": 18770 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.331, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "grad_norm": 2.6765992641448975, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "learning_rate": 3.5540145927744554e-05, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.3210102617740631, + "step": 18780 + }, + { + "ce_loss": 0.05250520259141922, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "distill_loss": 0.17485934495925903, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "ref_ce_loss": 0.0640835240483284, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.2694230377674103, + "step": 18780 + }, + { + "ce_loss": 0.06711471080780029, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "distill_loss": 0.1430618166923523, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "ref_ce_loss": 0.059164922684431076, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.24888139963150024, + "step": 18780 + }, + { + "ce_loss": 0.02815820463001728, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "distill_loss": 0.13266366720199585, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "ref_ce_loss": 0.04049038141965866, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "loss": 0.36241692304611206, + "step": 18780 + }, + { + "ce_loss": 0.05183975026011467, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "distill_loss": 0.17315593361854553, + "epoch": 6.2641761174116075, + "step": 18780 + }, + { + "epoch": 6.2641761174116075, + "ref_ce_loss": 0.10023162513971329, + "step": 18780 + }, + { + "epoch": 6.267511674449633, + "loss": 0.3397, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "grad_norm": 2.6068735122680664, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "learning_rate": 3.540933064368857e-05, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "loss": 0.27424922585487366, + "step": 18790 + }, + { + "ce_loss": 0.02315659075975418, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "distill_loss": 0.14970842003822327, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "ref_ce_loss": 0.06472202390432358, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "loss": 0.272493839263916, + "step": 18790 + }, + { + "ce_loss": 0.029809942469000816, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "distill_loss": 0.13639044761657715, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "ref_ce_loss": 0.052156079560518265, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "loss": 0.5869401097297668, + "step": 18790 + }, + { + "ce_loss": 0.05751309543848038, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "distill_loss": 0.1983068883419037, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "ref_ce_loss": 0.08109751343727112, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "loss": 0.22548475861549377, + "step": 18790 + }, + { + "ce_loss": 0.05232026055455208, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "distill_loss": 0.1157417818903923, + "epoch": 6.267511674449633, + "step": 18790 + }, + { + "epoch": 6.267511674449633, + "ref_ce_loss": 0.03327852487564087, + "step": 18790 + }, + { + "epoch": 6.270847231487658, + "loss": 0.3617, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "grad_norm": 6.989676475524902, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "learning_rate": 3.5278724327811174e-05, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "loss": 0.23098401725292206, + "step": 18800 + }, + { + "ce_loss": 0.013874993659555912, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "distill_loss": 0.15344513952732086, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "ref_ce_loss": 0.03826308622956276, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "loss": 0.39530304074287415, + "step": 18800 + }, + { + "ce_loss": 0.05913930758833885, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "distill_loss": 0.2634964883327484, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "ref_ce_loss": 0.05968732014298439, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "loss": 0.4437573254108429, + "step": 18800 + }, + { + "ce_loss": 0.015780942514538765, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "distill_loss": 0.15896551311016083, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "ref_ce_loss": 0.050289690494537354, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "loss": 0.26852864027023315, + "step": 18800 + }, + { + "ce_loss": 0.03627710044384003, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "distill_loss": 0.11065933853387833, + "epoch": 6.270847231487658, + "step": 18800 + }, + { + "epoch": 6.270847231487658, + "ref_ce_loss": 0.07877654582262039, + "step": 18800 + }, + { + "epoch": 6.274182788525684, + "loss": 0.3798, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "grad_norm": 3.7645487785339355, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "learning_rate": 3.514832721828676e-05, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "loss": 0.46515122056007385, + "step": 18810 + }, + { + "ce_loss": 0.08517283201217651, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "distill_loss": 0.14585131406784058, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "ref_ce_loss": 0.055102892220020294, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "loss": 0.28196051716804504, + "step": 18810 + }, + { + "ce_loss": 0.07954999804496765, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "distill_loss": 0.10934408754110336, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "ref_ce_loss": 0.05305926129221916, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "loss": 0.34345611929893494, + "step": 18810 + }, + { + "ce_loss": 0.0444522500038147, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "distill_loss": 0.20444200932979584, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "ref_ce_loss": 0.0704997181892395, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "loss": 0.47482192516326904, + "step": 18810 + }, + { + "ce_loss": 0.0612056702375412, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "distill_loss": 0.15192171931266785, + "epoch": 6.274182788525684, + "step": 18810 + }, + { + "epoch": 6.274182788525684, + "ref_ce_loss": 0.050350818783044815, + "step": 18810 + }, + { + "epoch": 6.277518345563709, + "loss": 0.3319, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "grad_norm": 3.944410800933838, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "learning_rate": 3.501813955290823e-05, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "loss": 0.23156653344631195, + "step": 18820 + }, + { + "ce_loss": 0.025693096220493317, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "distill_loss": 0.12437605857849121, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "ref_ce_loss": 0.06192917004227638, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "loss": 0.34078988432884216, + "step": 18820 + }, + { + "ce_loss": 0.07537341117858887, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "distill_loss": 0.1982569396495819, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "ref_ce_loss": 0.06689344346523285, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "loss": 0.30715587735176086, + "step": 18820 + }, + { + "ce_loss": 0.028312131762504578, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "distill_loss": 0.19388048350811005, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "ref_ce_loss": 0.04899001121520996, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "loss": 0.26771223545074463, + "step": 18820 + }, + { + "ce_loss": 0.0458672009408474, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "distill_loss": 0.12816867232322693, + "epoch": 6.277518345563709, + "step": 18820 + }, + { + "epoch": 6.277518345563709, + "ref_ce_loss": 0.05818053334951401, + "step": 18820 + }, + { + "epoch": 6.280853902601734, + "loss": 0.3275, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "grad_norm": 2.262253522872925, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "learning_rate": 3.48881615690865e-05, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "loss": 0.640660285949707, + "step": 18830 + }, + { + "ce_loss": 0.04574522003531456, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "distill_loss": 0.18484735488891602, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "ref_ce_loss": 0.0488605722784996, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "loss": 0.3685927391052246, + "step": 18830 + }, + { + "ce_loss": 0.05379222705960274, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "distill_loss": 0.12785358726978302, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "ref_ce_loss": 0.07150807976722717, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "loss": 0.5330727696418762, + "step": 18830 + }, + { + "ce_loss": 0.07434044033288956, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "distill_loss": 0.19990497827529907, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "ref_ce_loss": 0.06825008243322372, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "loss": 0.5475145578384399, + "step": 18830 + }, + { + "ce_loss": 0.05228644236922264, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "distill_loss": 0.15708759427070618, + "epoch": 6.280853902601734, + "step": 18830 + }, + { + "epoch": 6.280853902601734, + "ref_ce_loss": 0.03966747596859932, + "step": 18830 + }, + { + "epoch": 6.28418945963976, + "loss": 0.3523, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "grad_norm": 2.7833263874053955, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "learning_rate": 3.475839350385014e-05, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "loss": 0.23492398858070374, + "step": 18840 + }, + { + "ce_loss": 0.006972161587327719, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "distill_loss": 0.10143738985061646, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "ref_ce_loss": 0.04588712379336357, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "loss": 0.3233441412448883, + "step": 18840 + }, + { + "ce_loss": 0.048174165189266205, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "distill_loss": 0.19731523096561432, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "ref_ce_loss": 0.059398673474788666, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "loss": 0.43290847539901733, + "step": 18840 + }, + { + "ce_loss": 0.07412654906511307, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "distill_loss": 0.226647287607193, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "ref_ce_loss": 0.05764947459101677, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "loss": 0.35213813185691833, + "step": 18840 + }, + { + "ce_loss": 0.03941421955823898, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "distill_loss": 0.20007504522800446, + "epoch": 6.28418945963976, + "step": 18840 + }, + { + "epoch": 6.28418945963976, + "ref_ce_loss": 0.05939985811710358, + "step": 18840 + }, + { + "epoch": 6.287525016677785, + "loss": 0.3877, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "grad_norm": 2.940953016281128, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "learning_rate": 3.462883559384492e-05, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "loss": 0.2711646556854248, + "step": 18850 + }, + { + "ce_loss": 0.06784234195947647, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "distill_loss": 0.10876431316137314, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "ref_ce_loss": 0.058501046150922775, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "loss": 0.19347287714481354, + "step": 18850 + }, + { + "ce_loss": 0.026580985635519028, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "distill_loss": 0.12169019132852554, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "ref_ce_loss": 0.04512413963675499, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "loss": 0.2864726483821869, + "step": 18850 + }, + { + "ce_loss": 0.03934826701879501, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "distill_loss": 0.12577557563781738, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "ref_ce_loss": 0.07803640514612198, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "loss": 0.16455747187137604, + "step": 18850 + }, + { + "ce_loss": 0.02695547230541706, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "distill_loss": 0.0886109471321106, + "epoch": 6.287525016677785, + "step": 18850 + }, + { + "epoch": 6.287525016677785, + "ref_ce_loss": 0.03705105558037758, + "step": 18850 + }, + { + "epoch": 6.29086057371581, + "loss": 0.3807, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "grad_norm": 4.630390167236328, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "learning_rate": 3.449948807533337e-05, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "loss": 0.21386204659938812, + "step": 18860 + }, + { + "ce_loss": 0.026135116815567017, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "distill_loss": 0.13585272431373596, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "ref_ce_loss": 0.034613024443387985, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "loss": 0.31926730275154114, + "step": 18860 + }, + { + "ce_loss": 0.03757088631391525, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "distill_loss": 0.1286323070526123, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "ref_ce_loss": 0.046050816774368286, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "loss": 0.37816765904426575, + "step": 18860 + }, + { + "ce_loss": 0.12475141882896423, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "distill_loss": 0.16274906694889069, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "ref_ce_loss": 0.08075873553752899, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "loss": 0.24097076058387756, + "step": 18860 + }, + { + "ce_loss": 0.019521335139870644, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "distill_loss": 0.12991663813591003, + "epoch": 6.29086057371581, + "step": 18860 + }, + { + "epoch": 6.29086057371581, + "ref_ce_loss": 0.052921973168849945, + "step": 18860 + }, + { + "epoch": 6.294196130753836, + "loss": 0.351, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "grad_norm": 6.742186546325684, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "learning_rate": 3.437035118419439e-05, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "loss": 0.21090252697467804, + "step": 18870 + }, + { + "ce_loss": 0.038921330124139786, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "distill_loss": 0.12278036773204803, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "ref_ce_loss": 0.03309723734855652, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "loss": 0.25562670826911926, + "step": 18870 + }, + { + "ce_loss": 0.0049499752931296825, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "distill_loss": 0.20920096337795258, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "ref_ce_loss": 0.02717713825404644, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "loss": 0.6847727298736572, + "step": 18870 + }, + { + "ce_loss": 0.06199228763580322, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "distill_loss": 0.18934482336044312, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "ref_ce_loss": 0.045317307114601135, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "loss": 0.2877572178840637, + "step": 18870 + }, + { + "ce_loss": 0.05759180709719658, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "distill_loss": 0.12327567487955093, + "epoch": 6.294196130753836, + "step": 18870 + }, + { + "epoch": 6.294196130753836, + "ref_ce_loss": 0.07539641857147217, + "step": 18870 + }, + { + "epoch": 6.297531687791861, + "loss": 0.3752, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "grad_norm": 3.315523386001587, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "learning_rate": 3.4241425155922634e-05, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "loss": 0.29335570335388184, + "step": 18880 + }, + { + "ce_loss": 0.06978707015514374, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "distill_loss": 0.14715451002120972, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "ref_ce_loss": 0.060952696949243546, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "loss": 0.44177842140197754, + "step": 18880 + }, + { + "ce_loss": 0.04377702251076698, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "distill_loss": 0.1425521969795227, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "ref_ce_loss": 0.06627517193555832, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "loss": 0.2356526106595993, + "step": 18880 + }, + { + "ce_loss": 0.03892301395535469, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "distill_loss": 0.12812058627605438, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "ref_ce_loss": 0.050661858171224594, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "loss": 0.19576594233512878, + "step": 18880 + }, + { + "ce_loss": 0.022691428661346436, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "distill_loss": 0.1026381254196167, + "epoch": 6.297531687791861, + "step": 18880 + }, + { + "epoch": 6.297531687791861, + "ref_ce_loss": 0.05652888864278793, + "step": 18880 + }, + { + "epoch": 6.300867244829886, + "loss": 0.3281, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "grad_norm": 3.531867027282715, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "learning_rate": 3.4112710225628344e-05, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "loss": 0.45936667919158936, + "step": 18890 + }, + { + "ce_loss": 0.06959517300128937, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "distill_loss": 0.20052653551101685, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "ref_ce_loss": 0.046475961804389954, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "loss": 0.22902725636959076, + "step": 18890 + }, + { + "ce_loss": 0.030058017000555992, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "distill_loss": 0.1144430935382843, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "ref_ce_loss": 0.051945485174655914, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "loss": 0.46311238408088684, + "step": 18890 + }, + { + "ce_loss": 0.042298149317502975, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "distill_loss": 0.3087809383869171, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "ref_ce_loss": 0.05733121559023857, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "loss": 0.34530389308929443, + "step": 18890 + }, + { + "ce_loss": 0.030319813638925552, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "distill_loss": 0.15547746419906616, + "epoch": 6.300867244829886, + "step": 18890 + }, + { + "epoch": 6.300867244829886, + "ref_ce_loss": 0.08323720842599869, + "step": 18890 + }, + { + "epoch": 6.304202801867912, + "loss": 0.3448, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "grad_norm": 2.434091091156006, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "learning_rate": 3.398420662803684e-05, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "loss": 0.2415168285369873, + "step": 18900 + }, + { + "ce_loss": 0.020679494366049767, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "distill_loss": 0.16226181387901306, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "ref_ce_loss": 0.024102669209241867, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "loss": 0.2756112515926361, + "step": 18900 + }, + { + "ce_loss": 0.03828546032309532, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "distill_loss": 0.14190034568309784, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "ref_ce_loss": 0.044629864394664764, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "loss": 0.2844380736351013, + "step": 18900 + }, + { + "ce_loss": 0.0461580716073513, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "distill_loss": 0.12029999494552612, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "ref_ce_loss": 0.07012398540973663, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "loss": 0.2737322449684143, + "step": 18900 + }, + { + "ce_loss": 0.04904641583561897, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "distill_loss": 0.12067732214927673, + "epoch": 6.304202801867912, + "step": 18900 + }, + { + "epoch": 6.304202801867912, + "ref_ce_loss": 0.05247277021408081, + "step": 18900 + }, + { + "epoch": 6.307538358905937, + "loss": 0.3068, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "grad_norm": 2.252596139907837, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "learning_rate": 3.385591459748793e-05, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "loss": 0.29819396138191223, + "step": 18910 + }, + { + "ce_loss": 0.05503129959106445, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "distill_loss": 0.1623593419790268, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "ref_ce_loss": 0.08060084283351898, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "loss": 0.37681642174720764, + "step": 18910 + }, + { + "ce_loss": 0.05450444296002388, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "distill_loss": 0.20877453684806824, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "ref_ce_loss": 0.0601249523460865, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "loss": 0.39602282643318176, + "step": 18910 + }, + { + "ce_loss": 0.08872856199741364, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "distill_loss": 0.17702454328536987, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "ref_ce_loss": 0.09893164783716202, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "loss": 0.25537437200546265, + "step": 18910 + }, + { + "ce_loss": 0.016225768253207207, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "distill_loss": 0.13233773410320282, + "epoch": 6.307538358905937, + "step": 18910 + }, + { + "epoch": 6.307538358905937, + "ref_ce_loss": 0.04172232374548912, + "step": 18910 + }, + { + "epoch": 6.310873915943962, + "loss": 0.3138, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "grad_norm": 2.0156707763671875, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "learning_rate": 3.3727834367935634e-05, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "loss": 0.4027228355407715, + "step": 18920 + }, + { + "ce_loss": 0.09059377759695053, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "distill_loss": 0.16408628225326538, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "ref_ce_loss": 0.046289198100566864, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "loss": 0.4065344035625458, + "step": 18920 + }, + { + "ce_loss": 0.04324747249484062, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "distill_loss": 0.21216806769371033, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "ref_ce_loss": 0.08379409462213516, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "loss": 0.28860020637512207, + "step": 18920 + }, + { + "ce_loss": 0.0696495994925499, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "distill_loss": 0.15288947522640228, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "ref_ce_loss": 0.06594584882259369, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "loss": 0.5158238410949707, + "step": 18920 + }, + { + "ce_loss": 0.07455329596996307, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "distill_loss": 0.33285900950431824, + "epoch": 6.310873915943962, + "step": 18920 + }, + { + "epoch": 6.310873915943962, + "ref_ce_loss": 0.07242995500564575, + "step": 18920 + }, + { + "epoch": 6.314209472981988, + "loss": 0.3283, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "grad_norm": 2.4332995414733887, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "learning_rate": 3.35999661729479e-05, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "loss": 0.4734187722206116, + "step": 18930 + }, + { + "ce_loss": 0.051276229321956635, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "distill_loss": 0.21224486827850342, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "ref_ce_loss": 0.05212824419140816, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "loss": 0.2177160680294037, + "step": 18930 + }, + { + "ce_loss": 0.033990684896707535, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "distill_loss": 0.15057498216629028, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "ref_ce_loss": 0.03305617719888687, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "loss": 0.4837043881416321, + "step": 18930 + }, + { + "ce_loss": 0.06694763898849487, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "distill_loss": 0.2243620753288269, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "ref_ce_loss": 0.07067155838012695, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "loss": 0.34912294149398804, + "step": 18930 + }, + { + "ce_loss": 0.06255176663398743, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "distill_loss": 0.1636684238910675, + "epoch": 6.314209472981988, + "step": 18930 + }, + { + "epoch": 6.314209472981988, + "ref_ce_loss": 0.06407450139522552, + "step": 18930 + }, + { + "epoch": 6.317545030020013, + "loss": 0.3386, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "grad_norm": 3.589062213897705, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "learning_rate": 3.347231024570578e-05, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "loss": 0.2020236700773239, + "step": 18940 + }, + { + "ce_loss": 0.031198319047689438, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "distill_loss": 0.11204557120800018, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "ref_ce_loss": 0.05850578099489212, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "loss": 0.2590986490249634, + "step": 18940 + }, + { + "ce_loss": 0.03589296713471413, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "distill_loss": 0.17840811610221863, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "ref_ce_loss": 0.04469051584601402, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "loss": 0.25700971484184265, + "step": 18940 + }, + { + "ce_loss": 0.018838005140423775, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "distill_loss": 0.18496237695217133, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "ref_ce_loss": 0.03536644205451012, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "loss": 0.19318623840808868, + "step": 18940 + }, + { + "ce_loss": 0.011522611603140831, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "distill_loss": 0.10571037232875824, + "epoch": 6.317545030020013, + "step": 18940 + }, + { + "epoch": 6.317545030020013, + "ref_ce_loss": 0.04865308105945587, + "step": 18940 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.3627, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "grad_norm": 2.7662694454193115, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "learning_rate": 3.3344866819003374e-05, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.35670697689056396, + "step": 18950 + }, + { + "ce_loss": 0.031022926792502403, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "distill_loss": 0.1557820737361908, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "ref_ce_loss": 0.04982922598719597, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.2733045220375061, + "step": 18950 + }, + { + "ce_loss": 0.04908668249845505, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "distill_loss": 0.11684387177228928, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "ref_ce_loss": 0.055443841964006424, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.2913722097873688, + "step": 18950 + }, + { + "ce_loss": 0.06559805572032928, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "distill_loss": 0.1549169272184372, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "ref_ce_loss": 0.0595548078417778, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "loss": 0.2763633131980896, + "step": 18950 + }, + { + "ce_loss": 0.05452756956219673, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "distill_loss": 0.1420384794473648, + "epoch": 6.3208805870580385, + "step": 18950 + }, + { + "epoch": 6.3208805870580385, + "ref_ce_loss": 0.05865325778722763, + "step": 18950 + }, + { + "epoch": 6.324216144096064, + "loss": 0.3197, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "grad_norm": 3.9764132499694824, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "learning_rate": 3.321763612524716e-05, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "loss": 0.2885870039463043, + "step": 18960 + }, + { + "ce_loss": 0.019036101177334785, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "distill_loss": 0.14021548628807068, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "ref_ce_loss": 0.06481176614761353, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "loss": 0.3427791893482208, + "step": 18960 + }, + { + "ce_loss": 0.039608050137758255, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "distill_loss": 0.14981134235858917, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "ref_ce_loss": 0.08245816081762314, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "loss": 0.42633309960365295, + "step": 18960 + }, + { + "ce_loss": 0.05362397059798241, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "distill_loss": 0.24959130585193634, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "ref_ce_loss": 0.04351172223687172, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "loss": 0.43009763956069946, + "step": 18960 + }, + { + "ce_loss": 0.11033513396978378, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "distill_loss": 0.16854149103164673, + "epoch": 6.324216144096064, + "step": 18960 + }, + { + "epoch": 6.324216144096064, + "ref_ce_loss": 0.08382659405469894, + "step": 18960 + }, + { + "epoch": 6.327551701134089, + "loss": 0.3519, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "grad_norm": 3.6981232166290283, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "learning_rate": 3.309061839645578e-05, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "loss": 0.37140506505966187, + "step": 18970 + }, + { + "ce_loss": 0.0781472697854042, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "distill_loss": 0.17404945194721222, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "ref_ce_loss": 0.07811552286148071, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "loss": 0.27958112955093384, + "step": 18970 + }, + { + "ce_loss": 0.04201284795999527, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "distill_loss": 0.13044360280036926, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "ref_ce_loss": 0.048841409385204315, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "loss": 0.3533022999763489, + "step": 18970 + }, + { + "ce_loss": 0.05124984681606293, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "distill_loss": 0.23570401966571808, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "ref_ce_loss": 0.05524509772658348, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "loss": 0.41169291734695435, + "step": 18970 + }, + { + "ce_loss": 0.07361854612827301, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "distill_loss": 0.22925794124603271, + "epoch": 6.327551701134089, + "step": 18970 + }, + { + "epoch": 6.327551701134089, + "ref_ce_loss": 0.07110333442687988, + "step": 18970 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.3328, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "grad_norm": 4.419796943664551, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "learning_rate": 3.2963813864259436e-05, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.3489697575569153, + "step": 18980 + }, + { + "ce_loss": 0.019743165001273155, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "distill_loss": 0.180571511387825, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "ref_ce_loss": 0.047717832028865814, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.2253534197807312, + "step": 18980 + }, + { + "ce_loss": 0.03531552106142044, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "distill_loss": 0.1357894390821457, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "ref_ce_loss": 0.054144252091646194, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.30089667439460754, + "step": 18980 + }, + { + "ce_loss": 0.025029117241501808, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "distill_loss": 0.14975345134735107, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "ref_ce_loss": 0.05282760038971901, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "loss": 0.3071041703224182, + "step": 18980 + }, + { + "ce_loss": 0.035489119589328766, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "distill_loss": 0.1634751409292221, + "epoch": 6.3308872581721145, + "step": 18980 + }, + { + "epoch": 6.3308872581721145, + "ref_ce_loss": 0.04332876205444336, + "step": 18980 + }, + { + "epoch": 6.33422281521014, + "loss": 0.3604, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "grad_norm": 3.840822458267212, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "learning_rate": 3.2837222759899615e-05, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "loss": 0.26387926936149597, + "step": 18990 + }, + { + "ce_loss": 0.033305756747722626, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "distill_loss": 0.17537467181682587, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "ref_ce_loss": 0.04105931892991066, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "loss": 0.4222206771373749, + "step": 18990 + }, + { + "ce_loss": 0.04392791911959648, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "distill_loss": 0.1172325387597084, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "ref_ce_loss": 0.0700860545039177, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "loss": 0.22235901653766632, + "step": 18990 + }, + { + "ce_loss": 0.019134419038891792, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "distill_loss": 0.13299764692783356, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "ref_ce_loss": 0.05059622600674629, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "loss": 0.3985365629196167, + "step": 18990 + }, + { + "ce_loss": 0.029248295351862907, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "distill_loss": 0.19520241022109985, + "epoch": 6.33422281521014, + "step": 18990 + }, + { + "epoch": 6.33422281521014, + "ref_ce_loss": 0.048385899513959885, + "step": 18990 + }, + { + "epoch": 6.337558372248165, + "loss": 0.3584, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "grad_norm": 4.76231575012207, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "learning_rate": 3.271084531422857e-05, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "loss": 0.36909541487693787, + "step": 19000 + }, + { + "ce_loss": 0.05047956109046936, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "distill_loss": 0.12576617300510406, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "ref_ce_loss": 0.08630871772766113, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "loss": 0.346733421087265, + "step": 19000 + }, + { + "ce_loss": 0.08098523318767548, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "distill_loss": 0.1754927784204483, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "ref_ce_loss": 0.09000211209058762, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "loss": 0.2860293984413147, + "step": 19000 + }, + { + "ce_loss": 0.027446920052170753, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "distill_loss": 0.1255069524049759, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "ref_ce_loss": 0.05317498371005058, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "loss": 0.24360382556915283, + "step": 19000 + }, + { + "ce_loss": 0.028923992067575455, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "distill_loss": 0.1311366856098175, + "epoch": 6.337558372248165, + "step": 19000 + }, + { + "epoch": 6.337558372248165, + "ref_ce_loss": 0.03659401834011078, + "step": 19000 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.2933, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "grad_norm": 2.842592477798462, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "learning_rate": 3.258468175770884e-05, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.2973235845565796, + "step": 19010 + }, + { + "ce_loss": 0.03437792509794235, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "distill_loss": 0.10890894383192062, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "ref_ce_loss": 0.08101943135261536, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.25962209701538086, + "step": 19010 + }, + { + "ce_loss": 0.02973208576440811, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "distill_loss": 0.10564002394676208, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "ref_ce_loss": 0.04377582296729088, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.4331674575805664, + "step": 19010 + }, + { + "ce_loss": 0.053108036518096924, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "distill_loss": 0.14030146598815918, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "ref_ce_loss": 0.09056838601827621, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "loss": 0.2938518524169922, + "step": 19010 + }, + { + "ce_loss": 0.045009806752204895, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "distill_loss": 0.1620580404996872, + "epoch": 6.3408939292861906, + "step": 19010 + }, + { + "epoch": 6.3408939292861906, + "ref_ce_loss": 0.049979884177446365, + "step": 19010 + }, + { + "epoch": 6.344229486324216, + "loss": 0.2939, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "grad_norm": 8.198354721069336, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "learning_rate": 3.245873232041302e-05, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "loss": 0.3113076090812683, + "step": 19020 + }, + { + "ce_loss": 0.0427481047809124, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "distill_loss": 0.1602908819913864, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "ref_ce_loss": 0.08124813437461853, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "loss": 0.3438887894153595, + "step": 19020 + }, + { + "ce_loss": 0.031248973682522774, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "distill_loss": 0.20599618554115295, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "ref_ce_loss": 0.07038242369890213, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "loss": 0.43153202533721924, + "step": 19020 + }, + { + "ce_loss": 0.021240759640932083, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "distill_loss": 0.13821086287498474, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "ref_ce_loss": 0.05561009421944618, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "loss": 0.4814144968986511, + "step": 19020 + }, + { + "ce_loss": 0.043110962957143784, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "distill_loss": 0.3201991617679596, + "epoch": 6.344229486324216, + "step": 19020 + }, + { + "epoch": 6.344229486324216, + "ref_ce_loss": 0.09728872776031494, + "step": 19020 + }, + { + "epoch": 6.347565043362241, + "loss": 0.3491, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "grad_norm": 2.9248974323272705, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "learning_rate": 3.233299723202319e-05, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "loss": 0.32620903849601746, + "step": 19030 + }, + { + "ce_loss": 0.0165343526750803, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "distill_loss": 0.09083996713161469, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "ref_ce_loss": 0.05633927881717682, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "loss": 0.28984490036964417, + "step": 19030 + }, + { + "ce_loss": 0.04113488271832466, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "distill_loss": 0.20167671144008636, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "ref_ce_loss": 0.04684454947710037, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "loss": 0.32723718881607056, + "step": 19030 + }, + { + "ce_loss": 0.022285494953393936, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "distill_loss": 0.15769782662391663, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "ref_ce_loss": 0.0603957436978817, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "loss": 0.40456095337867737, + "step": 19030 + }, + { + "ce_loss": 0.012262187898159027, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "distill_loss": 0.17209021747112274, + "epoch": 6.347565043362241, + "step": 19030 + }, + { + "epoch": 6.347565043362241, + "ref_ce_loss": 0.06936907768249512, + "step": 19030 + }, + { + "epoch": 6.350900600400267, + "loss": 0.3313, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "grad_norm": 4.83967924118042, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "learning_rate": 3.2207476721830575e-05, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "loss": 0.34516626596450806, + "step": 19040 + }, + { + "ce_loss": 0.02484416589140892, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "distill_loss": 0.20764781534671783, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "ref_ce_loss": 0.11241208761930466, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "loss": 0.314484566450119, + "step": 19040 + }, + { + "ce_loss": 0.07472291588783264, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "distill_loss": 0.1334226131439209, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "ref_ce_loss": 0.06843020021915436, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "loss": 0.2526988387107849, + "step": 19040 + }, + { + "ce_loss": 0.08345230668783188, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "distill_loss": 0.12088726460933685, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "ref_ce_loss": 0.03264693170785904, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "loss": 0.22657568752765656, + "step": 19040 + }, + { + "ce_loss": 0.023793227970600128, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "distill_loss": 0.13206267356872559, + "epoch": 6.350900600400267, + "step": 19040 + }, + { + "epoch": 6.350900600400267, + "ref_ce_loss": 0.07039143890142441, + "step": 19040 + }, + { + "epoch": 6.354236157438292, + "loss": 0.3438, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "grad_norm": 5.042229652404785, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "learning_rate": 3.208217101873505e-05, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "loss": 0.22500789165496826, + "step": 19050 + }, + { + "ce_loss": 0.004221898503601551, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "distill_loss": 0.1627054512500763, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "ref_ce_loss": 0.05779343843460083, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "loss": 0.19450661540031433, + "step": 19050 + }, + { + "ce_loss": 0.008154689334332943, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "distill_loss": 0.10214529931545258, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "ref_ce_loss": 0.04879332706332207, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "loss": 0.9522825479507446, + "step": 19050 + }, + { + "ce_loss": 0.027610354125499725, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "distill_loss": 0.1594891995191574, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "ref_ce_loss": 0.0504218228161335, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "loss": 0.5254677534103394, + "step": 19050 + }, + { + "ce_loss": 0.07183732837438583, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "distill_loss": 0.17535251379013062, + "epoch": 6.354236157438292, + "step": 19050 + }, + { + "epoch": 6.354236157438292, + "ref_ce_loss": 0.05224858969449997, + "step": 19050 + }, + { + "epoch": 6.357571714476317, + "loss": 0.342, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "grad_norm": 2.714850664138794, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "learning_rate": 3.195708035124485e-05, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "loss": 0.28869467973709106, + "step": 19060 + }, + { + "ce_loss": 0.009162982925772667, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "distill_loss": 0.15321744978427887, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "ref_ce_loss": 0.04659789428114891, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "loss": 0.4857400059700012, + "step": 19060 + }, + { + "ce_loss": 0.0692988932132721, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "distill_loss": 0.16876783967018127, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "ref_ce_loss": 0.07196694612503052, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "loss": 0.2311512678861618, + "step": 19060 + }, + { + "ce_loss": 0.002413057256489992, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "distill_loss": 0.1793450266122818, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "ref_ce_loss": 0.04919705539941788, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "loss": 0.4767001271247864, + "step": 19060 + }, + { + "ce_loss": 0.0869223102927208, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "distill_loss": 0.2395099699497223, + "epoch": 6.357571714476317, + "step": 19060 + }, + { + "epoch": 6.357571714476317, + "ref_ce_loss": 0.06657543778419495, + "step": 19060 + }, + { + "epoch": 6.360907271514343, + "loss": 0.328, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "grad_norm": 2.230041265487671, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "learning_rate": 3.183220494747591e-05, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "loss": 0.3135024607181549, + "step": 19070 + }, + { + "ce_loss": 0.05656009167432785, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "distill_loss": 0.1409039944410324, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "ref_ce_loss": 0.08445709198713303, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "loss": 0.3105989992618561, + "step": 19070 + }, + { + "ce_loss": 0.055232059210538864, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "distill_loss": 0.17802393436431885, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "ref_ce_loss": 0.058957234025001526, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "loss": 0.2731439471244812, + "step": 19070 + }, + { + "ce_loss": 0.02287127636373043, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "distill_loss": 0.1520075649023056, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "ref_ce_loss": 0.05730840191245079, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "loss": 0.21526774764060974, + "step": 19070 + }, + { + "ce_loss": 0.015572651289403439, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "distill_loss": 0.10528266429901123, + "epoch": 6.360907271514343, + "step": 19070 + }, + { + "epoch": 6.360907271514343, + "ref_ce_loss": 0.0473422110080719, + "step": 19070 + }, + { + "epoch": 6.364242828552368, + "loss": 0.3333, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "grad_norm": 2.177327871322632, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "learning_rate": 3.170754503515176e-05, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "loss": 0.21960368752479553, + "step": 19080 + }, + { + "ce_loss": 0.004566080868244171, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "distill_loss": 0.12495573610067368, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "ref_ce_loss": 0.051851414144039154, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "loss": 0.23354806005954742, + "step": 19080 + }, + { + "ce_loss": 0.030264711007475853, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "distill_loss": 0.09896160662174225, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "ref_ce_loss": 0.05575990676879883, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "loss": 0.353571355342865, + "step": 19080 + }, + { + "ce_loss": 0.10856593400239944, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "distill_loss": 0.1839468628168106, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "ref_ce_loss": 0.060834918171167374, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "loss": 0.3653889298439026, + "step": 19080 + }, + { + "ce_loss": 0.1770656853914261, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "distill_loss": 0.09319852292537689, + "epoch": 6.364242828552368, + "step": 19080 + }, + { + "epoch": 6.364242828552368, + "ref_ce_loss": 0.07829434424638748, + "step": 19080 + }, + { + "epoch": 6.367578385590393, + "loss": 0.3147, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "grad_norm": 2.2435762882232666, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "learning_rate": 3.15831008416029e-05, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "loss": 0.1861688494682312, + "step": 19090 + }, + { + "ce_loss": 0.04676009714603424, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "distill_loss": 0.09529206156730652, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "ref_ce_loss": 0.04402006417512894, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "loss": 0.2757784426212311, + "step": 19090 + }, + { + "ce_loss": 0.027549242600798607, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "distill_loss": 0.1115616038441658, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "ref_ce_loss": 0.0614810436964035, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "loss": 0.2042546570301056, + "step": 19090 + }, + { + "ce_loss": 0.008537779562175274, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "distill_loss": 0.09340881556272507, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "ref_ce_loss": 0.03826262801885605, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "loss": 0.27964353561401367, + "step": 19090 + }, + { + "ce_loss": 0.00564216123893857, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "distill_loss": 0.12422443181276321, + "epoch": 6.367578385590393, + "step": 19090 + }, + { + "epoch": 6.367578385590393, + "ref_ce_loss": 0.03275144472718239, + "step": 19090 + }, + { + "epoch": 6.370913942628419, + "loss": 0.3159, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "grad_norm": 2.6322593688964844, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "learning_rate": 3.1458872593766445e-05, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "loss": 0.2862546741962433, + "step": 19100 + }, + { + "ce_loss": 0.02663441374897957, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "distill_loss": 0.14039082825183868, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "ref_ce_loss": 0.053199347108602524, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "loss": 0.3178987503051758, + "step": 19100 + }, + { + "ce_loss": 0.04347086697816849, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "distill_loss": 0.20151656866073608, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "ref_ce_loss": 0.07276992499828339, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "loss": 0.32933807373046875, + "step": 19100 + }, + { + "ce_loss": 0.07511289417743683, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "distill_loss": 0.16823294758796692, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "ref_ce_loss": 0.08579640090465546, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "loss": 0.3977130353450775, + "step": 19100 + }, + { + "ce_loss": 0.013505751267075539, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "distill_loss": 0.25070807337760925, + "epoch": 6.370913942628419, + "step": 19100 + }, + { + "epoch": 6.370913942628419, + "ref_ce_loss": 0.0833851620554924, + "step": 19100 + }, + { + "epoch": 6.374249499666444, + "loss": 0.3429, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "grad_norm": 3.5291786193847656, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "learning_rate": 3.133486051818576e-05, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "loss": 0.3006848692893982, + "step": 19110 + }, + { + "ce_loss": 0.07178663462400436, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "distill_loss": 0.16004428267478943, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "ref_ce_loss": 0.05003441497683525, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "loss": 0.37611380219459534, + "step": 19110 + }, + { + "ce_loss": 0.06637578457593918, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "distill_loss": 0.1888526827096939, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "ref_ce_loss": 0.07265367358922958, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "loss": 0.31075137853622437, + "step": 19110 + }, + { + "ce_loss": 0.048460204154253006, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "distill_loss": 0.1777610331773758, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "ref_ce_loss": 0.06488906592130661, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "loss": 0.2050069272518158, + "step": 19110 + }, + { + "ce_loss": 0.012430096045136452, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "distill_loss": 0.11187323927879333, + "epoch": 6.374249499666444, + "step": 19110 + }, + { + "epoch": 6.374249499666444, + "ref_ce_loss": 0.061666443943977356, + "step": 19110 + }, + { + "epoch": 6.377585056704469, + "loss": 0.3316, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "grad_norm": 2.4090237617492676, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "learning_rate": 3.121106484100988e-05, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "loss": 0.44150298833847046, + "step": 19120 + }, + { + "ce_loss": 0.1467040777206421, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "distill_loss": 0.17174892127513885, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "ref_ce_loss": 0.1003832221031189, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "loss": 0.22515568137168884, + "step": 19120 + }, + { + "ce_loss": 0.017367742955684662, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "distill_loss": 0.15258149802684784, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "ref_ce_loss": 0.05509953573346138, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "loss": 0.4535076320171356, + "step": 19120 + }, + { + "ce_loss": 0.03954509273171425, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "distill_loss": 0.2209874838590622, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "ref_ce_loss": 0.10431575775146484, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "loss": 0.2742467522621155, + "step": 19120 + }, + { + "ce_loss": 0.029925430193543434, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "distill_loss": 0.1687852144241333, + "epoch": 6.377585056704469, + "step": 19120 + }, + { + "epoch": 6.377585056704469, + "ref_ce_loss": 0.04623578116297722, + "step": 19120 + }, + { + "epoch": 6.380920613742495, + "loss": 0.3146, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "grad_norm": 2.218641996383667, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "learning_rate": 3.1087485787993364e-05, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "loss": 0.4525134563446045, + "step": 19130 + }, + { + "ce_loss": 0.06234154850244522, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "distill_loss": 0.21077755093574524, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "ref_ce_loss": 0.04453861340880394, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "loss": 0.5359585285186768, + "step": 19130 + }, + { + "ce_loss": 0.05334095284342766, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "distill_loss": 0.15786990523338318, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "ref_ce_loss": 0.06534440070390701, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "loss": 0.2719331383705139, + "step": 19130 + }, + { + "ce_loss": 0.03220943361520767, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "distill_loss": 0.15897700190544128, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "ref_ce_loss": 0.059988368302583694, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "loss": 0.31654953956604004, + "step": 19130 + }, + { + "ce_loss": 0.07118566334247589, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "distill_loss": 0.1746058613061905, + "epoch": 6.380920613742495, + "step": 19130 + }, + { + "epoch": 6.380920613742495, + "ref_ce_loss": 0.06019550561904907, + "step": 19130 + }, + { + "epoch": 6.38425617078052, + "loss": 0.3493, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "grad_norm": 2.842442750930786, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "learning_rate": 3.096412358449551e-05, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "loss": 0.3088509440422058, + "step": 19140 + }, + { + "ce_loss": 0.013124763034284115, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "distill_loss": 0.20202720165252686, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "ref_ce_loss": 0.041209910064935684, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "loss": 0.28975409269332886, + "step": 19140 + }, + { + "ce_loss": 0.0419902503490448, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "distill_loss": 0.15306031703948975, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "ref_ce_loss": 0.04372680187225342, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "loss": 0.264797568321228, + "step": 19140 + }, + { + "ce_loss": 0.009839768521487713, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "distill_loss": 0.1636621654033661, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "ref_ce_loss": 0.046388011425733566, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "loss": 0.45161283016204834, + "step": 19140 + }, + { + "ce_loss": 0.033660754561424255, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "distill_loss": 0.30521446466445923, + "epoch": 6.38425617078052, + "step": 19140 + }, + { + "epoch": 6.38425617078052, + "ref_ce_loss": 0.07519791275262833, + "step": 19140 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.3443, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "grad_norm": 2.7326242923736572, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "learning_rate": 3.0840978455480466e-05, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.23642776906490326, + "step": 19150 + }, + { + "ce_loss": 0.02581688016653061, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "distill_loss": 0.1117728054523468, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "ref_ce_loss": 0.028648825362324715, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.29809409379959106, + "step": 19150 + }, + { + "ce_loss": 0.050406236201524734, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "distill_loss": 0.17046400904655457, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "ref_ce_loss": 0.05761267989873886, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.2136099636554718, + "step": 19150 + }, + { + "ce_loss": 0.008947102352976799, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "distill_loss": 0.16757823526859283, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "ref_ce_loss": 0.036926738917827606, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "loss": 0.420158714056015, + "step": 19150 + }, + { + "ce_loss": 0.026515640318393707, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "distill_loss": 0.22909250855445862, + "epoch": 6.3875917278185455, + "step": 19150 + }, + { + "epoch": 6.3875917278185455, + "ref_ce_loss": 0.03809387609362602, + "step": 19150 + }, + { + "epoch": 6.390927284856571, + "loss": 0.3354, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "grad_norm": 1.891335368156433, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "learning_rate": 3.071805062551638e-05, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "loss": 0.6429652571678162, + "step": 19160 + }, + { + "ce_loss": 0.08628448098897934, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "distill_loss": 0.21496467292308807, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "ref_ce_loss": 0.0815964788198471, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "loss": 0.3746568560600281, + "step": 19160 + }, + { + "ce_loss": 0.09126932173967361, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "distill_loss": 0.1949469894170761, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "ref_ce_loss": 0.0748150646686554, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "loss": 0.18672306835651398, + "step": 19160 + }, + { + "ce_loss": 0.041305914521217346, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "distill_loss": 0.09442579746246338, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "ref_ce_loss": 0.05088605359196663, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "loss": 0.43783873319625854, + "step": 19160 + }, + { + "ce_loss": 0.05301617830991745, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "distill_loss": 0.18607942759990692, + "epoch": 6.390927284856571, + "step": 19160 + }, + { + "epoch": 6.390927284856571, + "ref_ce_loss": 0.07295983284711838, + "step": 19160 + }, + { + "epoch": 6.394262841894596, + "loss": 0.3414, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "grad_norm": 4.8256096839904785, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "learning_rate": 3.0595340318775e-05, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "loss": 0.3920935094356537, + "step": 19170 + }, + { + "ce_loss": 0.0550379641354084, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "distill_loss": 0.22485697269439697, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "ref_ce_loss": 0.07897008210420609, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "loss": 0.2931138873100281, + "step": 19170 + }, + { + "ce_loss": 0.023997221142053604, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "distill_loss": 0.15474891662597656, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "ref_ce_loss": 0.04679024592041969, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "loss": 0.5692634582519531, + "step": 19170 + }, + { + "ce_loss": 0.07035572826862335, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "distill_loss": 0.14807996153831482, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "ref_ce_loss": 0.0663730576634407, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "loss": 0.34553074836730957, + "step": 19170 + }, + { + "ce_loss": 0.04271988570690155, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "distill_loss": 0.09763485938310623, + "epoch": 6.394262841894596, + "step": 19170 + }, + { + "epoch": 6.394262841894596, + "ref_ce_loss": 0.07433194667100906, + "step": 19170 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.3648, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "grad_norm": 2.8879714012145996, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "learning_rate": 3.0472847759031644e-05, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.3636305630207062, + "step": 19180 + }, + { + "ce_loss": 0.028095953166484833, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "distill_loss": 0.23860491812229156, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "ref_ce_loss": 0.05020011588931084, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.6124520301818848, + "step": 19180 + }, + { + "ce_loss": 0.0461311973631382, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "distill_loss": 0.16430391371250153, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "ref_ce_loss": 0.059981171041727066, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.21800105273723602, + "step": 19180 + }, + { + "ce_loss": 0.02115524932742119, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "distill_loss": 0.12055207043886185, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "ref_ce_loss": 0.05713411420583725, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "loss": 0.16409829258918762, + "step": 19180 + }, + { + "ce_loss": 0.011136075481772423, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "distill_loss": 0.09124863147735596, + "epoch": 6.3975983989326215, + "step": 19180 + }, + { + "epoch": 6.3975983989326215, + "ref_ce_loss": 0.02289462462067604, + "step": 19180 + }, + { + "epoch": 6.400933955970647, + "loss": 0.2937, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "grad_norm": 3.5310378074645996, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "learning_rate": 3.03505731696643e-05, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "loss": 0.2121621072292328, + "step": 19190 + }, + { + "ce_loss": 0.023766357451677322, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "distill_loss": 0.09208349883556366, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "ref_ce_loss": 0.042189568281173706, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "loss": 0.6151399612426758, + "step": 19190 + }, + { + "ce_loss": 0.0640786662697792, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "distill_loss": 0.13085195422172546, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "ref_ce_loss": 0.04363051429390907, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "loss": 0.2088690996170044, + "step": 19190 + }, + { + "ce_loss": 0.02814677730202675, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "distill_loss": 0.11532458662986755, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "ref_ce_loss": 0.06498730182647705, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "loss": 0.2948390543460846, + "step": 19190 + }, + { + "ce_loss": 0.05973248556256294, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "distill_loss": 0.14913040399551392, + "epoch": 6.400933955970647, + "step": 19190 + }, + { + "epoch": 6.400933955970647, + "ref_ce_loss": 0.0471370592713356, + "step": 19190 + }, + { + "epoch": 6.404269513008672, + "loss": 0.3206, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "grad_norm": 2.9538049697875977, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "learning_rate": 3.0228516773653623e-05, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "loss": 0.2782128155231476, + "step": 19200 + }, + { + "ce_loss": 0.033618733286857605, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "distill_loss": 0.13662630319595337, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "ref_ce_loss": 0.0756227895617485, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "loss": 0.2780166566371918, + "step": 19200 + }, + { + "ce_loss": 0.036082640290260315, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "distill_loss": 0.13568821549415588, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "ref_ce_loss": 0.050824616104364395, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "loss": 0.3180280327796936, + "step": 19200 + }, + { + "ce_loss": 0.022766657173633575, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "distill_loss": 0.18709643185138702, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "ref_ce_loss": 0.047577161341905594, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "loss": 0.29317015409469604, + "step": 19200 + }, + { + "ce_loss": 0.02827540971338749, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "distill_loss": 0.12621057033538818, + "epoch": 6.404269513008672, + "step": 19200 + }, + { + "epoch": 6.404269513008672, + "ref_ce_loss": 0.06611227989196777, + "step": 19200 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.3585, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "grad_norm": 2.182614803314209, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "learning_rate": 3.0106678793582428e-05, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.3438469171524048, + "step": 19210 + }, + { + "ce_loss": 0.03777853399515152, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "distill_loss": 0.13100846111774445, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "ref_ce_loss": 0.06402308493852615, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.15743282437324524, + "step": 19210 + }, + { + "ce_loss": 0.01302177831530571, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "distill_loss": 0.07181822508573532, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "ref_ce_loss": 0.03954114392399788, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.3400402367115021, + "step": 19210 + }, + { + "ce_loss": 0.037813831120729446, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "distill_loss": 0.18940778076648712, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "ref_ce_loss": 0.08064355701208115, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "loss": 0.3088165819644928, + "step": 19210 + }, + { + "ce_loss": 0.0556054562330246, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "distill_loss": 0.1560615450143814, + "epoch": 6.4076050700466975, + "step": 19210 + }, + { + "epoch": 6.4076050700466975, + "ref_ce_loss": 0.06648774445056915, + "step": 19210 + }, + { + "epoch": 6.410940627084723, + "loss": 0.3273, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "grad_norm": 4.7798004150390625, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "learning_rate": 2.9985059451635023e-05, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "loss": 0.35199153423309326, + "step": 19220 + }, + { + "ce_loss": 0.07559584826231003, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "distill_loss": 0.12564608454704285, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "ref_ce_loss": 0.07979488372802734, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "loss": 0.5481826066970825, + "step": 19220 + }, + { + "ce_loss": 0.07940838485956192, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "distill_loss": 0.25284120440483093, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "ref_ce_loss": 0.09138768911361694, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "loss": 0.3082050085067749, + "step": 19220 + }, + { + "ce_loss": 0.05846491456031799, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "distill_loss": 0.11728785932064056, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "ref_ce_loss": 0.06966432929039001, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "loss": 0.2505358159542084, + "step": 19220 + }, + { + "ce_loss": 0.05416925996541977, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "distill_loss": 0.12237006425857544, + "epoch": 6.410940627084723, + "step": 19220 + }, + { + "epoch": 6.410940627084723, + "ref_ce_loss": 0.043212343007326126, + "step": 19220 + }, + { + "epoch": 6.414276184122748, + "loss": 0.3379, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "grad_norm": 2.7173337936401367, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "learning_rate": 2.986365896959715e-05, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "loss": 0.5793491005897522, + "step": 19230 + }, + { + "ce_loss": 0.0702369213104248, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "distill_loss": 0.20983470976352692, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "ref_ce_loss": 0.09685465693473816, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "loss": 0.26481112837791443, + "step": 19230 + }, + { + "ce_loss": 0.043139562010765076, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "distill_loss": 0.14797475934028625, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "ref_ce_loss": 0.055984094738960266, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "loss": 0.33620429039001465, + "step": 19230 + }, + { + "ce_loss": 0.024415483698248863, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "distill_loss": 0.18126405775547028, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "ref_ce_loss": 0.09445316344499588, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "loss": 0.20489488542079926, + "step": 19230 + }, + { + "ce_loss": 0.011756706051528454, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "distill_loss": 0.08259446918964386, + "epoch": 6.414276184122748, + "step": 19230 + }, + { + "epoch": 6.414276184122748, + "ref_ce_loss": 0.045302245765924454, + "step": 19230 + }, + { + "epoch": 6.417611741160774, + "loss": 0.349, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "grad_norm": 2.606694459915161, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "learning_rate": 2.9742477568855427e-05, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "loss": 0.33460116386413574, + "step": 19240 + }, + { + "ce_loss": 0.019410649314522743, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "distill_loss": 0.11360645294189453, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "ref_ce_loss": 0.059435781091451645, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "loss": 0.2454586774110794, + "step": 19240 + }, + { + "ce_loss": 0.03153756633400917, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "distill_loss": 0.142329141497612, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "ref_ce_loss": 0.0708671510219574, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "loss": 0.23517194390296936, + "step": 19240 + }, + { + "ce_loss": 0.013855576515197754, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "distill_loss": 0.08239498734474182, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "ref_ce_loss": 0.03543057292699814, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "loss": 0.4140257239341736, + "step": 19240 + }, + { + "ce_loss": 0.05405563861131668, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "distill_loss": 0.23824673891067505, + "epoch": 6.417611741160774, + "step": 19240 + }, + { + "epoch": 6.417611741160774, + "ref_ce_loss": 0.051883019506931305, + "step": 19240 + }, + { + "epoch": 6.420947298198799, + "loss": 0.3119, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "grad_norm": 3.075127601623535, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "learning_rate": 2.9621515470396873e-05, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "loss": 0.38381344079971313, + "step": 19250 + }, + { + "ce_loss": 0.02055729180574417, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "distill_loss": 0.1190321072936058, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "ref_ce_loss": 0.04908378794789314, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "loss": 0.30875587463378906, + "step": 19250 + }, + { + "ce_loss": 0.00966921728104353, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "distill_loss": 0.11165569722652435, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "ref_ce_loss": 0.039333853870630264, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "loss": 0.13892872631549835, + "step": 19250 + }, + { + "ce_loss": 0.01093357801437378, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "distill_loss": 0.08393251895904541, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "ref_ce_loss": 0.043848972767591476, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "loss": 0.2550225257873535, + "step": 19250 + }, + { + "ce_loss": 0.03890252485871315, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "distill_loss": 0.10823406279087067, + "epoch": 6.420947298198799, + "step": 19250 + }, + { + "epoch": 6.420947298198799, + "ref_ce_loss": 0.04668232426047325, + "step": 19250 + }, + { + "epoch": 6.424282855236824, + "loss": 0.3383, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "grad_norm": 2.459117889404297, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "learning_rate": 2.950077289480865e-05, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "loss": 0.19606022536754608, + "step": 19260 + }, + { + "ce_loss": 0.023685600608587265, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "distill_loss": 0.10593249648809433, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "ref_ce_loss": 0.04287222400307655, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "loss": 0.23405814170837402, + "step": 19260 + }, + { + "ce_loss": 0.05165858939290047, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "distill_loss": 0.10588884353637695, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "ref_ce_loss": 0.05655386298894882, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "loss": 0.34876590967178345, + "step": 19260 + }, + { + "ce_loss": 0.080736443400383, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "distill_loss": 0.13183307647705078, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "ref_ce_loss": 0.0751199871301651, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "loss": 0.30815309286117554, + "step": 19260 + }, + { + "ce_loss": 0.05185553804039955, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "distill_loss": 0.1449497938156128, + "epoch": 6.424282855236824, + "step": 19260 + }, + { + "epoch": 6.424282855236824, + "ref_ce_loss": 0.08047346770763397, + "step": 19260 + }, + { + "epoch": 6.42761841227485, + "loss": 0.3324, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "grad_norm": 3.1468026638031006, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "learning_rate": 2.938025006227761e-05, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "loss": 0.3166249394416809, + "step": 19270 + }, + { + "ce_loss": 0.043620720505714417, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "distill_loss": 0.23102976381778717, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "ref_ce_loss": 0.041840847581624985, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "loss": 0.3161732852458954, + "step": 19270 + }, + { + "ce_loss": 0.0137014864012599, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "distill_loss": 0.18281744420528412, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "ref_ce_loss": 0.06067816540598869, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "loss": 0.26537391543388367, + "step": 19270 + }, + { + "ce_loss": 0.044217947870492935, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "distill_loss": 0.13335096836090088, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "ref_ce_loss": 0.06993230432271957, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "loss": 0.16318552196025848, + "step": 19270 + }, + { + "ce_loss": 0.010892470367252827, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "distill_loss": 0.10559059679508209, + "epoch": 6.42761841227485, + "step": 19270 + }, + { + "epoch": 6.42761841227485, + "ref_ce_loss": 0.02419205941259861, + "step": 19270 + }, + { + "epoch": 6.430953969312875, + "loss": 0.3088, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "grad_norm": 2.171274185180664, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "learning_rate": 2.9259947192589843e-05, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "loss": 0.2121484875679016, + "step": 19280 + }, + { + "ce_loss": 0.042485639452934265, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "distill_loss": 0.09462635964155197, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "ref_ce_loss": 0.052952077239751816, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "loss": 0.34515848755836487, + "step": 19280 + }, + { + "ce_loss": 0.08936353772878647, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "distill_loss": 0.17028677463531494, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "ref_ce_loss": 0.07250023633241653, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "loss": 0.28915169835090637, + "step": 19280 + }, + { + "ce_loss": 0.04281798377633095, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "distill_loss": 0.1258801370859146, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "ref_ce_loss": 0.055673930794000626, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "loss": 0.37340790033340454, + "step": 19280 + }, + { + "ce_loss": 0.07182245701551437, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "distill_loss": 0.20456330478191376, + "epoch": 6.430953969312875, + "step": 19280 + }, + { + "epoch": 6.430953969312875, + "ref_ce_loss": 0.05647760257124901, + "step": 19280 + }, + { + "epoch": 6.4342895263509, + "loss": 0.3328, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "grad_norm": 2.09809947013855, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "learning_rate": 2.913986450513036e-05, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "loss": 0.5191903114318848, + "step": 19290 + }, + { + "ce_loss": 0.09405925869941711, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "distill_loss": 0.32466500997543335, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "ref_ce_loss": 0.07379074394702911, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "loss": 0.3605400323867798, + "step": 19290 + }, + { + "ce_loss": 0.0426713228225708, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "distill_loss": 0.08181039243936539, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "ref_ce_loss": 0.06672769784927368, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "loss": 0.2661738991737366, + "step": 19290 + }, + { + "ce_loss": 0.0049125500954687595, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "distill_loss": 0.20455636084079742, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "ref_ce_loss": 0.03672386705875397, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "loss": 0.234100803732872, + "step": 19290 + }, + { + "ce_loss": 0.03455338999629021, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "distill_loss": 0.12428376823663712, + "epoch": 6.4342895263509, + "step": 19290 + }, + { + "epoch": 6.4342895263509, + "ref_ce_loss": 0.05475219711661339, + "step": 19290 + }, + { + "epoch": 6.437625083388926, + "loss": 0.3237, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "grad_norm": 2.9320175647735596, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "learning_rate": 2.902000221888256e-05, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "loss": 0.29273325204849243, + "step": 19300 + }, + { + "ce_loss": 0.03256535902619362, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "distill_loss": 0.151731476187706, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "ref_ce_loss": 0.0654408186674118, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "loss": 0.22853368520736694, + "step": 19300 + }, + { + "ce_loss": 0.05244693532586098, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "distill_loss": 0.13937723636627197, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "ref_ce_loss": 0.03658019378781319, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "loss": 0.18904545903205872, + "step": 19300 + }, + { + "ce_loss": 0.021132905036211014, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "distill_loss": 0.11984669417142868, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "ref_ce_loss": 0.04028644412755966, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "loss": 0.473442018032074, + "step": 19300 + }, + { + "ce_loss": 0.04387342929840088, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "distill_loss": 0.21055030822753906, + "epoch": 6.437625083388926, + "step": 19300 + }, + { + "epoch": 6.437625083388926, + "ref_ce_loss": 0.0811781957745552, + "step": 19300 + }, + { + "epoch": 6.440960640426951, + "loss": 0.3371, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "grad_norm": 2.3369381427764893, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "learning_rate": 2.890036055242801e-05, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "loss": 0.25813591480255127, + "step": 19310 + }, + { + "ce_loss": 0.06117769703269005, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "distill_loss": 0.134428933262825, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "ref_ce_loss": 0.06232694536447525, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "loss": 0.19962909817695618, + "step": 19310 + }, + { + "ce_loss": 0.023090876638889313, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "distill_loss": 0.10459668934345245, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "ref_ce_loss": 0.07150375843048096, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "loss": 0.33547577261924744, + "step": 19310 + }, + { + "ce_loss": 0.05752252787351608, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "distill_loss": 0.21133238077163696, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "ref_ce_loss": 0.05103033781051636, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "loss": 0.35321956872940063, + "step": 19310 + }, + { + "ce_loss": 0.04094832018017769, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "distill_loss": 0.19313184916973114, + "epoch": 6.440960640426951, + "step": 19310 + }, + { + "epoch": 6.440960640426951, + "ref_ce_loss": 0.049679603427648544, + "step": 19310 + }, + { + "epoch": 6.444296197464976, + "loss": 0.3082, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "grad_norm": 3.1166648864746094, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "learning_rate": 2.8780939723945884e-05, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "loss": 0.4541691541671753, + "step": 19320 + }, + { + "ce_loss": 0.02630157396197319, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "distill_loss": 0.23176150023937225, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "ref_ce_loss": 0.08258750289678574, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "loss": 0.20466068387031555, + "step": 19320 + }, + { + "ce_loss": 0.03209497034549713, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "distill_loss": 0.10391910374164581, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "ref_ce_loss": 0.04612681642174721, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "loss": 0.34781479835510254, + "step": 19320 + }, + { + "ce_loss": 0.06227179616689682, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "distill_loss": 0.16145853698253632, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "ref_ce_loss": 0.055882956832647324, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "loss": 0.2700772285461426, + "step": 19320 + }, + { + "ce_loss": 0.02936231903731823, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "distill_loss": 0.14836882054805756, + "epoch": 6.444296197464976, + "step": 19320 + }, + { + "epoch": 6.444296197464976, + "ref_ce_loss": 0.05383678898215294, + "step": 19320 + }, + { + "epoch": 6.447631754503002, + "loss": 0.3323, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "grad_norm": 3.125654458999634, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "learning_rate": 2.8661739951212698e-05, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "loss": 0.4950518012046814, + "step": 19330 + }, + { + "ce_loss": 0.0657888725399971, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "distill_loss": 0.11284708976745605, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "ref_ce_loss": 0.0673019289970398, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "loss": 0.2192465364933014, + "step": 19330 + }, + { + "ce_loss": 0.033647581934928894, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "distill_loss": 0.14369481801986694, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "ref_ce_loss": 0.041760560125112534, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "loss": 0.20164823532104492, + "step": 19330 + }, + { + "ce_loss": 0.02778281643986702, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "distill_loss": 0.12590767443180084, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "ref_ce_loss": 0.047779470682144165, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "loss": 0.2597501575946808, + "step": 19330 + }, + { + "ce_loss": 0.01448689866811037, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "distill_loss": 0.14149931073188782, + "epoch": 6.447631754503002, + "step": 19330 + }, + { + "epoch": 6.447631754503002, + "ref_ce_loss": 0.061557453125715256, + "step": 19330 + }, + { + "epoch": 6.450967311541027, + "loss": 0.3349, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "grad_norm": 2.500840902328491, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "learning_rate": 2.8542761451601837e-05, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "loss": 0.2467081993818283, + "step": 19340 + }, + { + "ce_loss": 0.01818380504846573, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "distill_loss": 0.16112403571605682, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "ref_ce_loss": 0.06723609566688538, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "loss": 0.4431634545326233, + "step": 19340 + }, + { + "ce_loss": 0.05379560962319374, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "distill_loss": 0.19823703169822693, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "ref_ce_loss": 0.07378227263689041, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "loss": 0.7392683625221252, + "step": 19340 + }, + { + "ce_loss": 0.03509441763162613, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "distill_loss": 0.17824314534664154, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "ref_ce_loss": 0.06962034851312637, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "loss": 0.6556627750396729, + "step": 19340 + }, + { + "ce_loss": 0.11523985862731934, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "distill_loss": 0.17302967607975006, + "epoch": 6.450967311541027, + "step": 19340 + }, + { + "epoch": 6.450967311541027, + "ref_ce_loss": 0.06542285531759262, + "step": 19340 + }, + { + "epoch": 6.454302868579052, + "loss": 0.3345, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "grad_norm": 2.523237943649292, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "learning_rate": 2.8424004442083075e-05, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "loss": 0.42306017875671387, + "step": 19350 + }, + { + "ce_loss": 0.05187192186713219, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "distill_loss": 0.16845226287841797, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "ref_ce_loss": 0.06847023963928223, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "loss": 0.2361203134059906, + "step": 19350 + }, + { + "ce_loss": 0.06244039162993431, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "distill_loss": 0.1332026571035385, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "ref_ce_loss": 0.040215637534856796, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "loss": 0.2239590734243393, + "step": 19350 + }, + { + "ce_loss": 0.0467652902007103, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "distill_loss": 0.09530991315841675, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "ref_ce_loss": 0.05247754231095314, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "loss": 0.24619795382022858, + "step": 19350 + }, + { + "ce_loss": 0.0250222310423851, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "distill_loss": 0.12926128506660461, + "epoch": 6.454302868579052, + "step": 19350 + }, + { + "epoch": 6.454302868579052, + "ref_ce_loss": 0.0656496062874794, + "step": 19350 + }, + { + "epoch": 6.457638425617078, + "loss": 0.3546, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "grad_norm": 2.459033250808716, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "learning_rate": 2.8305469139222398e-05, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "loss": 0.34734630584716797, + "step": 19360 + }, + { + "ce_loss": 0.03935239836573601, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "distill_loss": 0.15108200907707214, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "ref_ce_loss": 0.0873950719833374, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "loss": 0.24328087270259857, + "step": 19360 + }, + { + "ce_loss": 0.026929683983325958, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "distill_loss": 0.09555982053279877, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "ref_ce_loss": 0.07317261397838593, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "loss": 0.2929458022117615, + "step": 19360 + }, + { + "ce_loss": 0.042498935014009476, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "distill_loss": 0.16249480843544006, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "ref_ce_loss": 0.04562801495194435, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "loss": 0.2420826107263565, + "step": 19360 + }, + { + "ce_loss": 0.027378996834158897, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "distill_loss": 0.14949826896190643, + "epoch": 6.457638425617078, + "step": 19360 + }, + { + "epoch": 6.457638425617078, + "ref_ce_loss": 0.06513167917728424, + "step": 19360 + }, + { + "epoch": 6.460973982655103, + "loss": 0.3376, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "grad_norm": 2.392075538635254, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "learning_rate": 2.8187155759181425e-05, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "loss": 0.12967228889465332, + "step": 19370 + }, + { + "ce_loss": 0.013152679428458214, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "distill_loss": 0.08467573672533035, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "ref_ce_loss": 0.03173118457198143, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "loss": 0.33118006587028503, + "step": 19370 + }, + { + "ce_loss": 0.08657965064048767, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "distill_loss": 0.17172375321388245, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "ref_ce_loss": 0.05565835162997246, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "loss": 0.23161426186561584, + "step": 19370 + }, + { + "ce_loss": 0.029349392279982567, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "distill_loss": 0.11940692365169525, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "ref_ce_loss": 0.06689442694187164, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "loss": 0.19995811581611633, + "step": 19370 + }, + { + "ce_loss": 0.004559206310659647, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "distill_loss": 0.095821812748909, + "epoch": 6.460973982655103, + "step": 19370 + }, + { + "epoch": 6.460973982655103, + "ref_ce_loss": 0.03864465653896332, + "step": 19370 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.2942, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "grad_norm": 2.8001151084899902, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "learning_rate": 2.8069064517717115e-05, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.564293384552002, + "step": 19380 + }, + { + "ce_loss": 0.0951477438211441, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "distill_loss": 0.16672182083129883, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "ref_ce_loss": 0.10000166296958923, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.21771246194839478, + "step": 19380 + }, + { + "ce_loss": 0.019646715372800827, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "distill_loss": 0.14466574788093567, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "ref_ce_loss": 0.05324574559926987, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.1648833155632019, + "step": 19380 + }, + { + "ce_loss": 0.0035948504228144884, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "distill_loss": 0.10792934894561768, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "ref_ce_loss": 0.034741222858428955, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "loss": 0.11709931492805481, + "step": 19380 + }, + { + "ce_loss": 0.0012851167703047395, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "distill_loss": 0.07367859780788422, + "epoch": 6.4643095396931285, + "step": 19380 + }, + { + "epoch": 6.4643095396931285, + "ref_ce_loss": 0.02455708011984825, + "step": 19380 + }, + { + "epoch": 6.467645096731154, + "loss": 0.3228, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "grad_norm": 4.402524948120117, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "learning_rate": 2.795119563018133e-05, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "loss": 0.42270222306251526, + "step": 19390 + }, + { + "ce_loss": 0.10031957924365997, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "distill_loss": 0.1878599226474762, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "ref_ce_loss": 0.10024359822273254, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "loss": 0.17374208569526672, + "step": 19390 + }, + { + "ce_loss": 0.027342218905687332, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "distill_loss": 0.11525329202413559, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "ref_ce_loss": 0.024801230058073997, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "loss": 0.716946542263031, + "step": 19390 + }, + { + "ce_loss": 0.07414887100458145, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "distill_loss": 0.18353484570980072, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "ref_ce_loss": 0.04698779433965683, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "loss": 0.21807637810707092, + "step": 19390 + }, + { + "ce_loss": 0.05198301002383232, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "distill_loss": 0.11755422502756119, + "epoch": 6.467645096731154, + "step": 19390 + }, + { + "epoch": 6.467645096731154, + "ref_ce_loss": 0.03329809382557869, + "step": 19390 + }, + { + "epoch": 6.470980653769179, + "loss": 0.3604, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "grad_norm": 2.1826651096343994, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "learning_rate": 2.7833549311520352e-05, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "loss": 0.4379744827747345, + "step": 19400 + }, + { + "ce_loss": 0.03534636273980141, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "distill_loss": 0.1718473583459854, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "ref_ce_loss": 0.08096379786729813, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "loss": 0.36185845732688904, + "step": 19400 + }, + { + "ce_loss": 0.05027947574853897, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "distill_loss": 0.1847555935382843, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "ref_ce_loss": 0.07357639074325562, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "loss": 0.27249574661254883, + "step": 19400 + }, + { + "ce_loss": 0.0771196261048317, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "distill_loss": 0.10766123235225677, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "ref_ce_loss": 0.04266400635242462, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "loss": 0.26876991987228394, + "step": 19400 + }, + { + "ce_loss": 0.019434722140431404, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "distill_loss": 0.19553104043006897, + "epoch": 6.470980653769179, + "step": 19400 + }, + { + "epoch": 6.470980653769179, + "ref_ce_loss": 0.03554742410778999, + "step": 19400 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.3295, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "grad_norm": 2.4636645317077637, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "learning_rate": 2.7716125776274694e-05, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.393093079328537, + "step": 19410 + }, + { + "ce_loss": 0.07405968010425568, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "distill_loss": 0.13686689734458923, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "ref_ce_loss": 0.07957617193460464, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.25447648763656616, + "step": 19410 + }, + { + "ce_loss": 0.022265994921326637, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "distill_loss": 0.1449824571609497, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "ref_ce_loss": 0.07170706242322922, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.3648896813392639, + "step": 19410 + }, + { + "ce_loss": 0.07455446571111679, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "distill_loss": 0.1877792775630951, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "ref_ce_loss": 0.06565703451633453, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "loss": 0.4845498204231262, + "step": 19410 + }, + { + "ce_loss": 0.052607208490371704, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "distill_loss": 0.19217851758003235, + "epoch": 6.4743162108072045, + "step": 19410 + }, + { + "epoch": 6.4743162108072045, + "ref_ce_loss": 0.05669070780277252, + "step": 19410 + }, + { + "epoch": 6.47765176784523, + "loss": 0.344, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "grad_norm": 2.88173508644104, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "learning_rate": 2.759892523857858e-05, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "loss": 0.22536416351795197, + "step": 19420 + }, + { + "ce_loss": 0.039910074323415756, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "distill_loss": 0.11129461973905563, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "ref_ce_loss": 0.05070248246192932, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "loss": 0.254660040140152, + "step": 19420 + }, + { + "ce_loss": 0.03568845987319946, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "distill_loss": 0.15099768340587616, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "ref_ce_loss": 0.06769350916147232, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "loss": 0.6331043243408203, + "step": 19420 + }, + { + "ce_loss": 0.04798339679837227, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "distill_loss": 0.17899511754512787, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "ref_ce_loss": 0.06974631547927856, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "loss": 0.2030748426914215, + "step": 19420 + }, + { + "ce_loss": 0.03313259407877922, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "distill_loss": 0.10403706133365631, + "epoch": 6.47765176784523, + "step": 19420 + }, + { + "epoch": 6.47765176784523, + "ref_ce_loss": 0.03035002388060093, + "step": 19420 + }, + { + "epoch": 6.480987324883255, + "loss": 0.3198, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "grad_norm": 4.207744121551514, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "learning_rate": 2.7481947912159542e-05, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "loss": 0.18555955588817596, + "step": 19430 + }, + { + "ce_loss": 0.014027750119566917, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "distill_loss": 0.1059509664773941, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "ref_ce_loss": 0.041501760482788086, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "loss": 0.5299850106239319, + "step": 19430 + }, + { + "ce_loss": 0.11079461127519608, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "distill_loss": 0.24580931663513184, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "ref_ce_loss": 0.056581251323223114, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "loss": 0.16338509321212769, + "step": 19430 + }, + { + "ce_loss": 0.031098000705242157, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "distill_loss": 0.09898979216814041, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "ref_ce_loss": 0.03305355831980705, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "loss": 0.3786182403564453, + "step": 19430 + }, + { + "ce_loss": 0.06183497980237007, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "distill_loss": 0.18642447888851166, + "epoch": 6.480987324883255, + "step": 19430 + }, + { + "epoch": 6.480987324883255, + "ref_ce_loss": 0.05898001417517662, + "step": 19430 + }, + { + "epoch": 6.484322881921281, + "loss": 0.2907, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "grad_norm": 5.101585865020752, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "learning_rate": 2.7365194010338126e-05, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "loss": 0.3683335781097412, + "step": 19440 + }, + { + "ce_loss": 0.017139267176389694, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "distill_loss": 0.17401042580604553, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "ref_ce_loss": 0.05724998190999031, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "loss": 0.19354148209095, + "step": 19440 + }, + { + "ce_loss": 0.0053457519970834255, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "distill_loss": 0.1339099109172821, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "ref_ce_loss": 0.0540962778031826, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "loss": 0.26310113072395325, + "step": 19440 + }, + { + "ce_loss": 0.047900937497615814, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "distill_loss": 0.12943311035633087, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "ref_ce_loss": 0.05395814776420593, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "loss": 0.21164949238300323, + "step": 19440 + }, + { + "ce_loss": 0.019808808341622353, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "distill_loss": 0.10979584604501724, + "epoch": 6.484322881921281, + "step": 19440 + }, + { + "epoch": 6.484322881921281, + "ref_ce_loss": 0.052284590899944305, + "step": 19440 + }, + { + "epoch": 6.487658438959306, + "loss": 0.3068, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "grad_norm": 2.3882312774658203, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "learning_rate": 2.7248663746027305e-05, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "loss": 0.26894763112068176, + "step": 19450 + }, + { + "ce_loss": 0.02126028575003147, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "distill_loss": 0.10786668211221695, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "ref_ce_loss": 0.058729927986860275, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "loss": 0.38041186332702637, + "step": 19450 + }, + { + "ce_loss": 0.035523854196071625, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "distill_loss": 0.15720032155513763, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "ref_ce_loss": 0.06419675052165985, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "loss": 0.39652425050735474, + "step": 19450 + }, + { + "ce_loss": 0.05832938849925995, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "distill_loss": 0.15557998418807983, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "ref_ce_loss": 0.07637973874807358, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "loss": 0.20689250528812408, + "step": 19450 + }, + { + "ce_loss": 0.012813769280910492, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "distill_loss": 0.15325000882148743, + "epoch": 6.487658438959306, + "step": 19450 + }, + { + "epoch": 6.487658438959306, + "ref_ce_loss": 0.04054896533489227, + "step": 19450 + }, + { + "epoch": 6.490993995997331, + "loss": 0.3084, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "grad_norm": 3.626648426055908, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "learning_rate": 2.7132357331732356e-05, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "loss": 0.15348277986049652, + "step": 19460 + }, + { + "ce_loss": 0.01779790408909321, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "distill_loss": 0.1153775304555893, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "ref_ce_loss": 0.020220749080181122, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "loss": 0.22395391762256622, + "step": 19460 + }, + { + "ce_loss": 0.046976491808891296, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "distill_loss": 0.11234377324581146, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "ref_ce_loss": 0.06450348347425461, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "loss": 0.44756627082824707, + "step": 19460 + }, + { + "ce_loss": 0.09042006731033325, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "distill_loss": 0.2449221909046173, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "ref_ce_loss": 0.080929696559906, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "loss": 0.2803463935852051, + "step": 19460 + }, + { + "ce_loss": 0.04512576386332512, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "distill_loss": 0.10338424146175385, + "epoch": 6.490993995997331, + "step": 19460 + }, + { + "epoch": 6.490993995997331, + "ref_ce_loss": 0.0502689890563488, + "step": 19460 + }, + { + "epoch": 6.494329553035357, + "loss": 0.3017, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "grad_norm": 1.9026871919631958, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "learning_rate": 2.7016274979550357e-05, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "loss": 0.45376911759376526, + "step": 19470 + }, + { + "ce_loss": 0.07573352009057999, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "distill_loss": 0.17655305564403534, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "ref_ce_loss": 0.05633631721138954, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "loss": 0.23756596446037292, + "step": 19470 + }, + { + "ce_loss": 0.040350113064050674, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "distill_loss": 0.11733220517635345, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "ref_ce_loss": 0.0678890123963356, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "loss": 0.23397405445575714, + "step": 19470 + }, + { + "ce_loss": 0.009241566061973572, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "distill_loss": 0.13413383066654205, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "ref_ce_loss": 0.04893755912780762, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "loss": 0.4001774191856384, + "step": 19470 + }, + { + "ce_loss": 0.01755928434431553, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "distill_loss": 0.2129809856414795, + "epoch": 6.494329553035357, + "step": 19470 + }, + { + "epoch": 6.494329553035357, + "ref_ce_loss": 0.05770876631140709, + "step": 19470 + }, + { + "epoch": 6.497665110073382, + "loss": 0.2931, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "grad_norm": 2.623253345489502, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "learning_rate": 2.6900416901169586e-05, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "loss": 0.3188818693161011, + "step": 19480 + }, + { + "ce_loss": 0.08621389418840408, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "distill_loss": 0.16553746163845062, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "ref_ce_loss": 0.04960956051945686, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "loss": 0.21584922075271606, + "step": 19480 + }, + { + "ce_loss": 0.0591447688639164, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "distill_loss": 0.11659364402294159, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "ref_ce_loss": 0.04008140414953232, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "loss": 0.15966036915779114, + "step": 19480 + }, + { + "ce_loss": 0.02307932637631893, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "distill_loss": 0.10009153187274933, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "ref_ce_loss": 0.03619556874036789, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "loss": 0.3482780456542969, + "step": 19480 + }, + { + "ce_loss": 0.08321208506822586, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "distill_loss": 0.14295196533203125, + "epoch": 6.497665110073382, + "step": 19480 + }, + { + "epoch": 6.497665110073382, + "ref_ce_loss": 0.09516530483961105, + "step": 19480 + }, + { + "epoch": 6.501000667111407, + "loss": 0.2853, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "grad_norm": 2.0386288166046143, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "learning_rate": 2.6784783307869624e-05, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "loss": 0.3476499319076538, + "step": 19490 + }, + { + "ce_loss": 0.03272515535354614, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "distill_loss": 0.179308220744133, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "ref_ce_loss": 0.09251862019300461, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "loss": 0.3680034279823303, + "step": 19490 + }, + { + "ce_loss": 0.06351684033870697, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "distill_loss": 0.16472645103931427, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "ref_ce_loss": 0.06539975851774216, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "loss": 0.5050325989723206, + "step": 19490 + }, + { + "ce_loss": 0.051710743457078934, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "distill_loss": 0.28480255603790283, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "ref_ce_loss": 0.07189995795488358, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "loss": 0.2924560010433197, + "step": 19490 + }, + { + "ce_loss": 0.0532911941409111, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "distill_loss": 0.13427993655204773, + "epoch": 6.501000667111407, + "step": 19490 + }, + { + "epoch": 6.501000667111407, + "ref_ce_loss": 0.06003837287425995, + "step": 19490 + }, + { + "epoch": 6.504336224149433, + "loss": 0.3273, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "grad_norm": 3.2773895263671875, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "learning_rate": 2.666937441052049e-05, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "loss": 0.31917792558670044, + "step": 19500 + }, + { + "ce_loss": 0.06857472658157349, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "distill_loss": 0.14248031377792358, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "ref_ce_loss": 0.061197735369205475, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "loss": 0.2877909541130066, + "step": 19500 + }, + { + "ce_loss": 0.03194195777177811, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "distill_loss": 0.11442811787128448, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "ref_ce_loss": 0.049518194049596786, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "loss": 0.20438994467258453, + "step": 19500 + }, + { + "ce_loss": 0.005468576215207577, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "distill_loss": 0.15710853040218353, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "ref_ce_loss": 0.04172136262059212, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "loss": 0.4296785593032837, + "step": 19500 + }, + { + "ce_loss": 0.11481533944606781, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "distill_loss": 0.17444954812526703, + "epoch": 6.504336224149433, + "step": 19500 + }, + { + "epoch": 6.504336224149433, + "ref_ce_loss": 0.09351933747529984, + "step": 19500 + }, + { + "epoch": 6.507671781187458, + "loss": 0.3024, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "grad_norm": 2.5083553791046143, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "learning_rate": 2.6554190419582432e-05, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "loss": 0.22696717083454132, + "step": 19510 + }, + { + "ce_loss": 0.024120118468999863, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "distill_loss": 0.118647001683712, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "ref_ce_loss": 0.05346282944083214, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "loss": 0.42250338196754456, + "step": 19510 + }, + { + "ce_loss": 0.04991762340068817, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "distill_loss": 0.2798449993133545, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "ref_ce_loss": 0.06090565398335457, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "loss": 0.21890859305858612, + "step": 19510 + }, + { + "ce_loss": 0.04088882729411125, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "distill_loss": 0.13536345958709717, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "ref_ce_loss": 0.042482659220695496, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "loss": 0.48329630494117737, + "step": 19510 + }, + { + "ce_loss": 0.08375865966081619, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "distill_loss": 0.18079915642738342, + "epoch": 6.507671781187458, + "step": 19510 + }, + { + "epoch": 6.507671781187458, + "ref_ce_loss": 0.05869739502668381, + "step": 19510 + }, + { + "epoch": 6.511007338225483, + "loss": 0.3078, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "grad_norm": 1.8151508569717407, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "learning_rate": 2.64392315451057e-05, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "loss": 0.20219141244888306, + "step": 19520 + }, + { + "ce_loss": 0.02228640951216221, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "distill_loss": 0.1161341741681099, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "ref_ce_loss": 0.035142045468091965, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "loss": 0.3143247663974762, + "step": 19520 + }, + { + "ce_loss": 0.05089471861720085, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "distill_loss": 0.18314094841480255, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "ref_ce_loss": 0.061314839869737625, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "loss": 0.4873974025249481, + "step": 19520 + }, + { + "ce_loss": 0.0976187065243721, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "distill_loss": 0.2322872132062912, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "ref_ce_loss": 0.06849783658981323, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "loss": 0.23710933327674866, + "step": 19520 + }, + { + "ce_loss": 0.04801061376929283, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "distill_loss": 0.10981900990009308, + "epoch": 6.511007338225483, + "step": 19520 + }, + { + "epoch": 6.511007338225483, + "ref_ce_loss": 0.06101440265774727, + "step": 19520 + }, + { + "epoch": 6.514342895263509, + "loss": 0.3379, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "grad_norm": 2.442229986190796, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "learning_rate": 2.6324497996729826e-05, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "loss": 0.36532866954803467, + "step": 19530 + }, + { + "ce_loss": 0.04357844591140747, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "distill_loss": 0.1568370908498764, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "ref_ce_loss": 0.0767655000090599, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "loss": 0.5425922274589539, + "step": 19530 + }, + { + "ce_loss": 0.005232425406575203, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "distill_loss": 0.15154841542243958, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "ref_ce_loss": 0.0805111974477768, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "loss": 0.3620684742927551, + "step": 19530 + }, + { + "ce_loss": 0.004763866309076548, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "distill_loss": 0.2671721279621124, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "ref_ce_loss": 0.06306330114603043, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "loss": 0.22314409911632538, + "step": 19530 + }, + { + "ce_loss": 0.0018380864057689905, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "distill_loss": 0.08867863565683365, + "epoch": 6.514342895263509, + "step": 19530 + }, + { + "epoch": 6.514342895263509, + "ref_ce_loss": 0.04645582661032677, + "step": 19530 + }, + { + "epoch": 6.517678452301534, + "loss": 0.327, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "grad_norm": 2.6358141899108887, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "learning_rate": 2.620998998368358e-05, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "loss": 0.2660335600376129, + "step": 19540 + }, + { + "ce_loss": 0.006880718749016523, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "distill_loss": 0.15171124041080475, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "ref_ce_loss": 0.05697092413902283, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "loss": 0.34473586082458496, + "step": 19540 + }, + { + "ce_loss": 0.026561858132481575, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "distill_loss": 0.12937229871749878, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "ref_ce_loss": 0.0451975092291832, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "loss": 0.38856256008148193, + "step": 19540 + }, + { + "ce_loss": 0.04068716615438461, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "distill_loss": 0.10566786676645279, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "ref_ce_loss": 0.04714061692357063, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "loss": 0.16135339438915253, + "step": 19540 + }, + { + "ce_loss": 0.02129344828426838, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "distill_loss": 0.08420635759830475, + "epoch": 6.517678452301534, + "step": 19540 + }, + { + "epoch": 6.517678452301534, + "ref_ce_loss": 0.03777875006198883, + "step": 19540 + }, + { + "epoch": 6.521014009339559, + "loss": 0.3013, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "grad_norm": 2.993635654449463, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "learning_rate": 2.6095707714784515e-05, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "loss": 0.28706902265548706, + "step": 19550 + }, + { + "ce_loss": 0.04375440999865532, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "distill_loss": 0.1598789095878601, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "ref_ce_loss": 0.05451327934861183, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "loss": 0.25459298491477966, + "step": 19550 + }, + { + "ce_loss": 0.041393257677555084, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "distill_loss": 0.11711672693490982, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "ref_ce_loss": 0.05613476410508156, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "loss": 0.24426577985286713, + "step": 19550 + }, + { + "ce_loss": 0.044446419924497604, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "distill_loss": 0.10138127952814102, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "ref_ce_loss": 0.06777234375476837, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "loss": 0.3395106792449951, + "step": 19550 + }, + { + "ce_loss": 0.05529210716485977, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "distill_loss": 0.1776595264673233, + "epoch": 6.521014009339559, + "step": 19550 + }, + { + "epoch": 6.521014009339559, + "ref_ce_loss": 0.07718256115913391, + "step": 19550 + }, + { + "epoch": 6.524349566377585, + "loss": 0.3031, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "grad_norm": 2.298203229904175, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "learning_rate": 2.5981651398438262e-05, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "loss": 0.3190702795982361, + "step": 19560 + }, + { + "ce_loss": 0.08099543303251266, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "distill_loss": 0.143859401345253, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "ref_ce_loss": 0.06572287529706955, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "loss": 0.22269880771636963, + "step": 19560 + }, + { + "ce_loss": 0.0325460359454155, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "distill_loss": 0.1178068220615387, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "ref_ce_loss": 0.05695272609591484, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "loss": 0.35457098484039307, + "step": 19560 + }, + { + "ce_loss": 0.02406204119324684, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "distill_loss": 0.18583686649799347, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "ref_ce_loss": 0.07211542874574661, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "loss": 0.34551870822906494, + "step": 19560 + }, + { + "ce_loss": 0.016009874641895294, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "distill_loss": 0.12047820538282394, + "epoch": 6.524349566377585, + "step": 19560 + }, + { + "epoch": 6.524349566377585, + "ref_ce_loss": 0.04428781569004059, + "step": 19560 + }, + { + "epoch": 6.52768512341561, + "loss": 0.2768, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "grad_norm": 2.2297275066375732, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "learning_rate": 2.586782124263867e-05, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "loss": 0.15318286418914795, + "step": 19570 + }, + { + "ce_loss": 0.01596526987850666, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "distill_loss": 0.09388856589794159, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "ref_ce_loss": 0.043243296444416046, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "loss": 0.3122502863407135, + "step": 19570 + }, + { + "ce_loss": 0.07972883433103561, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "distill_loss": 0.11671815812587738, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "ref_ce_loss": 0.09080839902162552, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "loss": 0.3512519299983978, + "step": 19570 + }, + { + "ce_loss": 0.03099985606968403, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "distill_loss": 0.13011573255062103, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "ref_ce_loss": 0.06851420551538467, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "loss": 0.34722644090652466, + "step": 19570 + }, + { + "ce_loss": 0.045901209115982056, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "distill_loss": 0.16597992181777954, + "epoch": 6.52768512341561, + "step": 19570 + }, + { + "epoch": 6.52768512341561, + "ref_ce_loss": 0.056316304951906204, + "step": 19570 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.3432, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "grad_norm": 3.3036727905273438, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "learning_rate": 2.5754217454966937e-05, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.43914103507995605, + "step": 19580 + }, + { + "ce_loss": 0.04930100217461586, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "distill_loss": 0.27969545125961304, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "ref_ce_loss": 0.07513166964054108, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.45070549845695496, + "step": 19580 + }, + { + "ce_loss": 0.0634470283985138, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "distill_loss": 0.15421168506145477, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "ref_ce_loss": 0.05427777022123337, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.41704732179641724, + "step": 19580 + }, + { + "ce_loss": 0.06318405270576477, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "distill_loss": 0.1228623166680336, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "ref_ce_loss": 0.0780521035194397, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "loss": 0.6786539554595947, + "step": 19580 + }, + { + "ce_loss": 0.045191653072834015, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "distill_loss": 0.20268556475639343, + "epoch": 6.5310206804536355, + "step": 19580 + }, + { + "epoch": 6.5310206804536355, + "ref_ce_loss": 0.09469419717788696, + "step": 19580 + }, + { + "epoch": 6.534356237491661, + "loss": 0.3673, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "grad_norm": 3.4037835597991943, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "learning_rate": 2.564084024259159e-05, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "loss": 0.3583972454071045, + "step": 19590 + }, + { + "ce_loss": 0.04135497286915779, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "distill_loss": 0.20269116759300232, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "ref_ce_loss": 0.06698576360940933, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "loss": 0.6243804097175598, + "step": 19590 + }, + { + "ce_loss": 0.05481613427400589, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "distill_loss": 0.16993610560894012, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "ref_ce_loss": 0.07230786234140396, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "loss": 0.2733704745769501, + "step": 19590 + }, + { + "ce_loss": 0.05378222465515137, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "distill_loss": 0.14002224802970886, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "ref_ce_loss": 0.07940661907196045, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "loss": 0.44000738859176636, + "step": 19590 + }, + { + "ce_loss": 0.023452216759324074, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "distill_loss": 0.0927174910902977, + "epoch": 6.534356237491661, + "step": 19590 + }, + { + "epoch": 6.534356237491661, + "ref_ce_loss": 0.03387615829706192, + "step": 19590 + }, + { + "epoch": 6.537691794529686, + "loss": 0.3117, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "grad_norm": 2.190343141555786, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "learning_rate": 2.5527689812267987e-05, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "loss": 0.28801581263542175, + "step": 19600 + }, + { + "ce_loss": 0.051726970821619034, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "distill_loss": 0.1135430559515953, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "ref_ce_loss": 0.022708710283041, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "loss": 0.27752554416656494, + "step": 19600 + }, + { + "ce_loss": 0.028300290927290916, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "distill_loss": 0.1339690089225769, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "ref_ce_loss": 0.06856250762939453, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "loss": 0.3291257619857788, + "step": 19600 + }, + { + "ce_loss": 0.044249553233385086, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "distill_loss": 0.12134141474962234, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "ref_ce_loss": 0.043635156005620956, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "loss": 0.5704269409179688, + "step": 19600 + }, + { + "ce_loss": 0.11629431694746017, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "distill_loss": 0.30639445781707764, + "epoch": 6.537691794529686, + "step": 19600 + }, + { + "epoch": 6.537691794529686, + "ref_ce_loss": 0.07897748053073883, + "step": 19600 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.3411, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "grad_norm": 4.623196601867676, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "learning_rate": 2.5414766370337814e-05, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.21084001660346985, + "step": 19610 + }, + { + "ce_loss": 0.02315536141395569, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "distill_loss": 0.11800826340913773, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "ref_ce_loss": 0.04089812561869621, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.2711634635925293, + "step": 19610 + }, + { + "ce_loss": 0.021899105980992317, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "distill_loss": 0.12212783098220825, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "ref_ce_loss": 0.05432126671075821, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.30355560779571533, + "step": 19610 + }, + { + "ce_loss": 0.04438330605626106, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "distill_loss": 0.13687101006507874, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "ref_ce_loss": 0.08127232640981674, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "loss": 0.42635995149612427, + "step": 19610 + }, + { + "ce_loss": 0.030337288975715637, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "distill_loss": 0.14552177488803864, + "epoch": 6.5410273515677115, + "step": 19610 + }, + { + "epoch": 6.5410273515677115, + "ref_ce_loss": 0.06268715858459473, + "step": 19610 + }, + { + "epoch": 6.544362908605737, + "loss": 0.3284, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "grad_norm": 2.259993076324463, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "learning_rate": 2.530207012272898e-05, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "loss": 0.31314048171043396, + "step": 19620 + }, + { + "ce_loss": 0.07636706531047821, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "distill_loss": 0.16563206911087036, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "ref_ce_loss": 0.05040166527032852, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "loss": 0.22983363270759583, + "step": 19620 + }, + { + "ce_loss": 0.042005062103271484, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "distill_loss": 0.10108568519353867, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "ref_ce_loss": 0.059286121279001236, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "loss": 0.2737887501716614, + "step": 19620 + }, + { + "ce_loss": 0.011679647490382195, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "distill_loss": 0.19483031332492828, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "ref_ce_loss": 0.052262429147958755, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "loss": 0.5098199844360352, + "step": 19620 + }, + { + "ce_loss": 0.13560065627098083, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "distill_loss": 0.20365437865257263, + "epoch": 6.544362908605737, + "step": 19620 + }, + { + "epoch": 6.544362908605737, + "ref_ce_loss": 0.1021995022892952, + "step": 19620 + }, + { + "epoch": 6.547698465643762, + "loss": 0.3097, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "grad_norm": 2.537757635116577, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "learning_rate": 2.5189601274954873e-05, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "loss": 0.7845300436019897, + "step": 19630 + }, + { + "ce_loss": 0.09764357656240463, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "distill_loss": 0.18290607631206512, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "ref_ce_loss": 0.05711343511939049, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "loss": 0.3228994607925415, + "step": 19630 + }, + { + "ce_loss": 0.017980199307203293, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "distill_loss": 0.1762242466211319, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "ref_ce_loss": 0.03918452188372612, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "loss": 0.22011463344097137, + "step": 19630 + }, + { + "ce_loss": 0.04538943991065025, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "distill_loss": 0.11361631751060486, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "ref_ce_loss": 0.04930086433887482, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "loss": 0.45519328117370605, + "step": 19630 + }, + { + "ce_loss": 0.06884275376796722, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "distill_loss": 0.18536312878131866, + "epoch": 6.547698465643762, + "step": 19630 + }, + { + "epoch": 6.547698465643762, + "ref_ce_loss": 0.06121458113193512, + "step": 19630 + }, + { + "epoch": 6.551034022681788, + "loss": 0.3376, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "grad_norm": 2.452275514602661, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "learning_rate": 2.507736003211435e-05, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "loss": 0.5876684188842773, + "step": 19640 + }, + { + "ce_loss": 0.07951506227254868, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "distill_loss": 0.17339465022087097, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "ref_ce_loss": 0.06315672397613525, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "loss": 0.4543311297893524, + "step": 19640 + }, + { + "ce_loss": 0.04960149526596069, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "distill_loss": 0.1443757563829422, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "ref_ce_loss": 0.075065977871418, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "loss": 0.29707980155944824, + "step": 19640 + }, + { + "ce_loss": 0.06787025928497314, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "distill_loss": 0.16085000336170197, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "ref_ce_loss": 0.058474279940128326, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "loss": 0.45773807168006897, + "step": 19640 + }, + { + "ce_loss": 0.06551582366228104, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "distill_loss": 0.12983930110931396, + "epoch": 6.551034022681788, + "step": 19640 + }, + { + "epoch": 6.551034022681788, + "ref_ce_loss": 0.06701023131608963, + "step": 19640 + }, + { + "epoch": 6.554369579719813, + "loss": 0.35, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "grad_norm": 2.6646275520324707, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "learning_rate": 2.4965346598891185e-05, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "loss": 0.32107043266296387, + "step": 19650 + }, + { + "ce_loss": 0.06096203625202179, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "distill_loss": 0.15283983945846558, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "ref_ce_loss": 0.07251904904842377, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "loss": 0.3576314449310303, + "step": 19650 + }, + { + "ce_loss": 0.07515022158622742, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "distill_loss": 0.1723017394542694, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "ref_ce_loss": 0.07873984426259995, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "loss": 0.21501293778419495, + "step": 19650 + }, + { + "ce_loss": 0.009262898936867714, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "distill_loss": 0.15100812911987305, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "ref_ce_loss": 0.0326073132455349, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "loss": 0.429170161485672, + "step": 19650 + }, + { + "ce_loss": 0.0786544531583786, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "distill_loss": 0.2469421774148941, + "epoch": 6.554369579719813, + "step": 19650 + }, + { + "epoch": 6.554369579719813, + "ref_ce_loss": 0.08241055905818939, + "step": 19650 + }, + { + "epoch": 6.557705136757838, + "loss": 0.3269, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "grad_norm": 2.3054118156433105, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "learning_rate": 2.485356117955367e-05, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "loss": 0.3260459899902344, + "step": 19660 + }, + { + "ce_loss": 0.09615436941385269, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "distill_loss": 0.1380174458026886, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "ref_ce_loss": 0.05804324895143509, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "loss": 0.36628609895706177, + "step": 19660 + }, + { + "ce_loss": 0.015312734991312027, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "distill_loss": 0.16019655764102936, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "ref_ce_loss": 0.0526125393807888, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "loss": 0.3451019525527954, + "step": 19660 + }, + { + "ce_loss": 0.024979015812277794, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "distill_loss": 0.18551787734031677, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "ref_ce_loss": 0.040732912719249725, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "loss": 0.3880947232246399, + "step": 19660 + }, + { + "ce_loss": 0.03313489258289337, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "distill_loss": 0.13498447835445404, + "epoch": 6.557705136757838, + "step": 19660 + }, + { + "epoch": 6.557705136757838, + "ref_ce_loss": 0.04829491302371025, + "step": 19660 + }, + { + "epoch": 6.561040693795864, + "loss": 0.3219, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "grad_norm": 5.515630722045898, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "learning_rate": 2.4742003977954333e-05, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "loss": 0.47084906697273254, + "step": 19670 + }, + { + "ce_loss": 0.09112807363271713, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "distill_loss": 0.14611652493476868, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "ref_ce_loss": 0.058892469853162766, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "loss": 0.3811488747596741, + "step": 19670 + }, + { + "ce_loss": 0.04032864049077034, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "distill_loss": 0.1337955743074417, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "ref_ce_loss": 0.08700317144393921, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "loss": 0.2712763547897339, + "step": 19670 + }, + { + "ce_loss": 0.02721991576254368, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "distill_loss": 0.10200801491737366, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "ref_ce_loss": 0.0590171180665493, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "loss": 0.24486324191093445, + "step": 19670 + }, + { + "ce_loss": 0.0423901304602623, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "distill_loss": 0.1491403877735138, + "epoch": 6.561040693795864, + "step": 19670 + }, + { + "epoch": 6.561040693795864, + "ref_ce_loss": 0.05284509062767029, + "step": 19670 + }, + { + "epoch": 6.564376250833889, + "loss": 0.3541, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "grad_norm": 3.7235469818115234, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "learning_rate": 2.4630675197529502e-05, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "loss": 0.4071599841117859, + "step": 19680 + }, + { + "ce_loss": 0.01765054650604725, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "distill_loss": 0.14170853793621063, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "ref_ce_loss": 0.042518824338912964, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "loss": 0.43594300746917725, + "step": 19680 + }, + { + "ce_loss": 0.07345236092805862, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "distill_loss": 0.1204528734087944, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "ref_ce_loss": 0.05486462637782097, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "loss": 0.2822025716304779, + "step": 19680 + }, + { + "ce_loss": 0.057404521852731705, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "distill_loss": 0.15036694705486298, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "ref_ce_loss": 0.05206619203090668, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "loss": 0.21536864340305328, + "step": 19680 + }, + { + "ce_loss": 0.04800006374716759, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "distill_loss": 0.10483649373054504, + "epoch": 6.564376250833889, + "step": 19680 + }, + { + "epoch": 6.564376250833889, + "ref_ce_loss": 0.06246866285800934, + "step": 19680 + }, + { + "epoch": 6.567711807871914, + "loss": 0.3322, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "grad_norm": 2.986670732498169, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "learning_rate": 2.4519575041298934e-05, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "loss": 0.28218936920166016, + "step": 19690 + }, + { + "ce_loss": 0.031078292056918144, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "distill_loss": 0.18602138757705688, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "ref_ce_loss": 0.06485332548618317, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "loss": 0.7765505909919739, + "step": 19690 + }, + { + "ce_loss": 0.065073661506176, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "distill_loss": 0.16883963346481323, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "ref_ce_loss": 0.05374979227781296, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "loss": 0.2536375820636749, + "step": 19690 + }, + { + "ce_loss": 0.03500214219093323, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "distill_loss": 0.14786192774772644, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "ref_ce_loss": 0.05251917988061905, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "loss": 0.146646186709404, + "step": 19690 + }, + { + "ce_loss": 0.03096270188689232, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "distill_loss": 0.07382689416408539, + "epoch": 6.567711807871914, + "step": 19690 + }, + { + "epoch": 6.567711807871914, + "ref_ce_loss": 0.03107365407049656, + "step": 19690 + }, + { + "epoch": 6.57104736490994, + "loss": 0.36, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "grad_norm": 7.0674052238464355, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "learning_rate": 2.4408703711865507e-05, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "loss": 0.21289116144180298, + "step": 19700 + }, + { + "ce_loss": 0.010433454066514969, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "distill_loss": 0.11860831081867218, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "ref_ce_loss": 0.060785289853811264, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "loss": 0.3320046067237854, + "step": 19700 + }, + { + "ce_loss": 0.10327617079019547, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "distill_loss": 0.11185026913881302, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "ref_ce_loss": 0.08796876668930054, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "loss": 0.23405468463897705, + "step": 19700 + }, + { + "ce_loss": 0.022999800741672516, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "distill_loss": 0.1573328971862793, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "ref_ce_loss": 0.05357423424720764, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "loss": 0.2924388349056244, + "step": 19700 + }, + { + "ce_loss": 0.046956613659858704, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "distill_loss": 0.11884532868862152, + "epoch": 6.57104736490994, + "step": 19700 + }, + { + "epoch": 6.57104736490994, + "ref_ce_loss": 0.06552345305681229, + "step": 19700 + }, + { + "epoch": 6.574382921947965, + "loss": 0.3085, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "grad_norm": 3.8683853149414062, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "learning_rate": 2.4298061411414775e-05, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "loss": 0.2815577983856201, + "step": 19710 + }, + { + "ce_loss": 0.03336150571703911, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "distill_loss": 0.16840992867946625, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "ref_ce_loss": 0.049621641635894775, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "loss": 0.38527244329452515, + "step": 19710 + }, + { + "ce_loss": 0.030045749619603157, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "distill_loss": 0.17417395114898682, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "ref_ce_loss": 0.045052722096443176, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "loss": 0.3040308952331543, + "step": 19710 + }, + { + "ce_loss": 0.06124549359083176, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "distill_loss": 0.13505572080612183, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "ref_ce_loss": 0.0780586525797844, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "loss": 0.22426313161849976, + "step": 19710 + }, + { + "ce_loss": 0.008811813779175282, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "distill_loss": 0.1401672214269638, + "epoch": 6.574382921947965, + "step": 19710 + }, + { + "epoch": 6.574382921947965, + "ref_ce_loss": 0.05200808495283127, + "step": 19710 + }, + { + "epoch": 6.57771847898599, + "loss": 0.3185, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "grad_norm": 2.2868518829345703, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "learning_rate": 2.418764834171466e-05, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "loss": 0.36733943223953247, + "step": 19720 + }, + { + "ce_loss": 0.06563469767570496, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "distill_loss": 0.14585933089256287, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "ref_ce_loss": 0.08136098086833954, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "loss": 0.5045070648193359, + "step": 19720 + }, + { + "ce_loss": 0.10214339941740036, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "distill_loss": 0.14387455582618713, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "ref_ce_loss": 0.08991748094558716, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "loss": 0.35117292404174805, + "step": 19720 + }, + { + "ce_loss": 0.05239369720220566, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "distill_loss": 0.1372997909784317, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "ref_ce_loss": 0.07272414863109589, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "loss": 0.21220049262046814, + "step": 19720 + }, + { + "ce_loss": 0.035478733479976654, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "distill_loss": 0.09870263934135437, + "epoch": 6.57771847898599, + "step": 19720 + }, + { + "epoch": 6.57771847898599, + "ref_ce_loss": 0.04688771069049835, + "step": 19720 + }, + { + "epoch": 6.581054036024016, + "loss": 0.3218, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "grad_norm": 2.9975807666778564, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "learning_rate": 2.407746470411508e-05, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "loss": 0.3309047818183899, + "step": 19730 + }, + { + "ce_loss": 0.026400120928883553, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "distill_loss": 0.20293043553829193, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "ref_ce_loss": 0.07825887948274612, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "loss": 0.41066548228263855, + "step": 19730 + }, + { + "ce_loss": 0.0435505174100399, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "distill_loss": 0.17935574054718018, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "ref_ce_loss": 0.06096060946583748, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "loss": 0.5351892113685608, + "step": 19730 + }, + { + "ce_loss": 0.06245271489024162, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "distill_loss": 0.2792205810546875, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "ref_ce_loss": 0.09187360852956772, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "loss": 0.21385572850704193, + "step": 19730 + }, + { + "ce_loss": 0.04891948029398918, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "distill_loss": 0.09563419222831726, + "epoch": 6.581054036024016, + "step": 19730 + }, + { + "epoch": 6.581054036024016, + "ref_ce_loss": 0.05390109866857529, + "step": 19730 + }, + { + "epoch": 6.584389593062041, + "loss": 0.3156, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "grad_norm": 3.010976791381836, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "learning_rate": 2.3967510699547453e-05, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "loss": 0.3294471204280853, + "step": 19740 + }, + { + "ce_loss": 0.05028639733791351, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "distill_loss": 0.17179375886917114, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "ref_ce_loss": 0.04911860451102257, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "loss": 0.9547842741012573, + "step": 19740 + }, + { + "ce_loss": 0.062249574810266495, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "distill_loss": 0.17876005172729492, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "ref_ce_loss": 0.08448649197816849, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "loss": 0.1854000985622406, + "step": 19740 + }, + { + "ce_loss": 0.03438950702548027, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "distill_loss": 0.10151194036006927, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "ref_ce_loss": 0.034116681665182114, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "loss": 0.24452723562717438, + "step": 19740 + }, + { + "ce_loss": 0.02735762856900692, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "distill_loss": 0.14382018148899078, + "epoch": 6.584389593062041, + "step": 19740 + }, + { + "epoch": 6.584389593062041, + "ref_ce_loss": 0.054051473736763, + "step": 19740 + }, + { + "epoch": 6.587725150100066, + "loss": 0.3189, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "grad_norm": 3.135653495788574, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "learning_rate": 2.3857786528524607e-05, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "loss": 0.21763204038143158, + "step": 19750 + }, + { + "ce_loss": 0.04860735684633255, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "distill_loss": 0.12845882773399353, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "ref_ce_loss": 0.04027068614959717, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "loss": 0.466017484664917, + "step": 19750 + }, + { + "ce_loss": 0.09125525504350662, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "distill_loss": 0.25751644372940063, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "ref_ce_loss": 0.05083215609192848, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "loss": 0.316772997379303, + "step": 19750 + }, + { + "ce_loss": 0.007712164893746376, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "distill_loss": 0.14403752982616425, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "ref_ce_loss": 0.052840352058410645, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "loss": 0.352708637714386, + "step": 19750 + }, + { + "ce_loss": 0.05708223953843117, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "distill_loss": 0.15306146442890167, + "epoch": 6.587725150100066, + "step": 19750 + }, + { + "epoch": 6.587725150100066, + "ref_ce_loss": 0.05850322172045708, + "step": 19750 + }, + { + "epoch": 6.591060707138092, + "loss": 0.3042, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "grad_norm": 2.567244291305542, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "learning_rate": 2.374829239114e-05, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "loss": 0.25095030665397644, + "step": 19760 + }, + { + "ce_loss": 0.0396430529654026, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "distill_loss": 0.0905349925160408, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "ref_ce_loss": 0.053239136934280396, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "loss": 0.28194355964660645, + "step": 19760 + }, + { + "ce_loss": 0.044967278838157654, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "distill_loss": 0.1526344120502472, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "ref_ce_loss": 0.058355461806058884, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "loss": 0.277133584022522, + "step": 19760 + }, + { + "ce_loss": 0.02993590198457241, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "distill_loss": 0.09442713856697083, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "ref_ce_loss": 0.0476006343960762, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "loss": 0.6724720001220703, + "step": 19760 + }, + { + "ce_loss": 0.046295564621686935, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "distill_loss": 0.1278403103351593, + "epoch": 6.591060707138092, + "step": 19760 + }, + { + "epoch": 6.591060707138092, + "ref_ce_loss": 0.035039693117141724, + "step": 19760 + }, + { + "epoch": 6.594396264176117, + "loss": 0.3334, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "grad_norm": 2.155388355255127, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "learning_rate": 2.363902848706789e-05, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "loss": 0.2597852349281311, + "step": 19770 + }, + { + "ce_loss": 0.03040352649986744, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "distill_loss": 0.1443725973367691, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "ref_ce_loss": 0.08462069928646088, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "loss": 0.17788177728652954, + "step": 19770 + }, + { + "ce_loss": 0.00964513048529625, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "distill_loss": 0.11348067969083786, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "ref_ce_loss": 0.05442032590508461, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "loss": 0.31917548179626465, + "step": 19770 + }, + { + "ce_loss": 0.046949710696935654, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "distill_loss": 0.14541833102703094, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "ref_ce_loss": 0.09034581482410431, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "loss": 0.28402474522590637, + "step": 19770 + }, + { + "ce_loss": 0.03271313011646271, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "distill_loss": 0.14252933859825134, + "epoch": 6.594396264176117, + "step": 19770 + }, + { + "epoch": 6.594396264176117, + "ref_ce_loss": 0.05029868707060814, + "step": 19770 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.2999, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "grad_norm": 2.598196029663086, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "learning_rate": 2.352999501556251e-05, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.37572720646858215, + "step": 19780 + }, + { + "ce_loss": 0.027209611609578133, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "distill_loss": 0.16785335540771484, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "ref_ce_loss": 0.06948821246623993, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.46975427865982056, + "step": 19780 + }, + { + "ce_loss": 0.040823787450790405, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "distill_loss": 0.10489165782928467, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "ref_ce_loss": 0.04194294288754463, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.20247483253479004, + "step": 19780 + }, + { + "ce_loss": 0.04620786011219025, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "distill_loss": 0.10938869416713715, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "ref_ce_loss": 0.04655927047133446, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "loss": 0.35143670439720154, + "step": 19780 + }, + { + "ce_loss": 0.06539808213710785, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "distill_loss": 0.1366189569234848, + "epoch": 6.5977318212141425, + "step": 19780 + }, + { + "epoch": 6.5977318212141425, + "ref_ce_loss": 0.06182894483208656, + "step": 19780 + }, + { + "epoch": 6.601067378252168, + "loss": 0.3129, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "grad_norm": 2.330700159072876, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "learning_rate": 2.3421192175457837e-05, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "loss": 0.30677473545074463, + "step": 19790 + }, + { + "ce_loss": 0.025118449702858925, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "distill_loss": 0.17951764166355133, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "ref_ce_loss": 0.036090608686208725, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "loss": 0.4100573658943176, + "step": 19790 + }, + { + "ce_loss": 0.057707469910383224, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "distill_loss": 0.2209986001253128, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "ref_ce_loss": 0.09653498977422714, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "loss": 0.24320702254772186, + "step": 19790 + }, + { + "ce_loss": 0.03562565892934799, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "distill_loss": 0.13927190005779266, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "ref_ce_loss": 0.03783516213297844, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "loss": 0.32648521661758423, + "step": 19790 + }, + { + "ce_loss": 0.038021087646484375, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "distill_loss": 0.14461548626422882, + "epoch": 6.601067378252168, + "step": 19790 + }, + { + "epoch": 6.601067378252168, + "ref_ce_loss": 0.068316750228405, + "step": 19790 + }, + { + "epoch": 6.604402935290193, + "loss": 0.3589, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "grad_norm": 3.626997947692871, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "learning_rate": 2.331262016516736e-05, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "loss": 0.557469367980957, + "step": 19800 + }, + { + "ce_loss": 0.04689061641693115, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "distill_loss": 0.10365889221429825, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "ref_ce_loss": 0.04763340950012207, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "loss": 0.24095264077186584, + "step": 19800 + }, + { + "ce_loss": 0.009478450752794743, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "distill_loss": 0.16503193974494934, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "ref_ce_loss": 0.039028752595186234, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "loss": 0.2705317437648773, + "step": 19800 + }, + { + "ce_loss": 0.034018658101558685, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "distill_loss": 0.15237151086330414, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "ref_ce_loss": 0.08372320234775543, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "loss": 0.206894651055336, + "step": 19800 + }, + { + "ce_loss": 0.023637909442186356, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "distill_loss": 0.13018612563610077, + "epoch": 6.604402935290193, + "step": 19800 + }, + { + "epoch": 6.604402935290193, + "ref_ce_loss": 0.03761601820588112, + "step": 19800 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.32, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "grad_norm": 2.208696126937866, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "learning_rate": 2.320427918268367e-05, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.16685768961906433, + "step": 19810 + }, + { + "ce_loss": 0.032697293907403946, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "distill_loss": 0.10233201086521149, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "ref_ce_loss": 0.03176421672105789, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.3800828158855438, + "step": 19810 + }, + { + "ce_loss": 0.042058251798152924, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "distill_loss": 0.12607955932617188, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "ref_ce_loss": 0.07670360058546066, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.22122445702552795, + "step": 19810 + }, + { + "ce_loss": 0.01819663867354393, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "distill_loss": 0.14853982627391815, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "ref_ce_loss": 0.03529641404747963, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "loss": 0.6402498483657837, + "step": 19810 + }, + { + "ce_loss": 0.06503716856241226, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "distill_loss": 0.13584941625595093, + "epoch": 6.6077384923282185, + "step": 19810 + }, + { + "epoch": 6.6077384923282185, + "ref_ce_loss": 0.06338845938444138, + "step": 19810 + }, + { + "epoch": 6.611074049366244, + "loss": 0.3375, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "grad_norm": 2.6924424171447754, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "learning_rate": 2.3096169425577826e-05, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "loss": 0.21234257519245148, + "step": 19820 + }, + { + "ce_loss": 0.030125008895993233, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "distill_loss": 0.13141590356826782, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "ref_ce_loss": 0.05019145458936691, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "loss": 0.21372869610786438, + "step": 19820 + }, + { + "ce_loss": 0.023508066311478615, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "distill_loss": 0.14203333854675293, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "ref_ce_loss": 0.03470898047089577, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "loss": 0.25468939542770386, + "step": 19820 + }, + { + "ce_loss": 0.04465780407190323, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "distill_loss": 0.13727760314941406, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "ref_ce_loss": 0.07237095385789871, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "loss": 0.24952755868434906, + "step": 19820 + }, + { + "ce_loss": 0.03197052329778671, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "distill_loss": 0.1438474804162979, + "epoch": 6.611074049366244, + "step": 19820 + }, + { + "epoch": 6.611074049366244, + "ref_ce_loss": 0.049808233976364136, + "step": 19820 + }, + { + "epoch": 6.614409606404269, + "loss": 0.3421, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "grad_norm": 2.183060646057129, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "learning_rate": 2.2988291090999555e-05, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "loss": 0.18967588245868683, + "step": 19830 + }, + { + "ce_loss": 0.019939763471484184, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "distill_loss": 0.13336925208568573, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "ref_ce_loss": 0.03612978756427765, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "loss": 0.18258681893348694, + "step": 19830 + }, + { + "ce_loss": 0.020963216200470924, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "distill_loss": 0.09793965518474579, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "ref_ce_loss": 0.04985945671796799, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "loss": 0.2376529574394226, + "step": 19830 + }, + { + "ce_loss": 0.029153717681765556, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "distill_loss": 0.14805583655834198, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "ref_ce_loss": 0.04305458813905716, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "loss": 0.38180723786354065, + "step": 19830 + }, + { + "ce_loss": 0.11473371833562851, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "distill_loss": 0.14703942835330963, + "epoch": 6.614409606404269, + "step": 19830 + }, + { + "epoch": 6.614409606404269, + "ref_ce_loss": 0.0409589558839798, + "step": 19830 + }, + { + "epoch": 6.617745163442295, + "loss": 0.3106, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "grad_norm": 3.1054604053497314, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "learning_rate": 2.2880644375676276e-05, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "loss": 0.25367286801338196, + "step": 19840 + }, + { + "ce_loss": 0.06676454842090607, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "distill_loss": 0.1250157654285431, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "ref_ce_loss": 0.046141307801008224, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "loss": 0.27414095401763916, + "step": 19840 + }, + { + "ce_loss": 0.005423164926469326, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "distill_loss": 0.11123090237379074, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "ref_ce_loss": 0.025682704523205757, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "loss": 0.5452802777290344, + "step": 19840 + }, + { + "ce_loss": 0.05291275680065155, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "distill_loss": 0.09778635203838348, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "ref_ce_loss": 0.05122615769505501, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "loss": 0.3839438259601593, + "step": 19840 + }, + { + "ce_loss": 0.06498966366052628, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "distill_loss": 0.22986114025115967, + "epoch": 6.617745163442295, + "step": 19840 + }, + { + "epoch": 6.617745163442295, + "ref_ce_loss": 0.06074221059679985, + "step": 19840 + }, + { + "epoch": 6.62108072048032, + "loss": 0.3192, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "grad_norm": 1.9881457090377808, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "learning_rate": 2.2773229475913163e-05, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "loss": 0.3907688856124878, + "step": 19850 + }, + { + "ce_loss": 0.09534867107868195, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "distill_loss": 0.1554652899503708, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "ref_ce_loss": 0.07192099839448929, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "loss": 0.2497331202030182, + "step": 19850 + }, + { + "ce_loss": 0.01978166028857231, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "distill_loss": 0.12318174540996552, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "ref_ce_loss": 0.040960054844617844, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "loss": 0.31594735383987427, + "step": 19850 + }, + { + "ce_loss": 0.029789138585329056, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "distill_loss": 0.1177356094121933, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "ref_ce_loss": 0.05026852339506149, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "loss": 0.4592970609664917, + "step": 19850 + }, + { + "ce_loss": 0.15999820828437805, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "distill_loss": 0.21153071522712708, + "epoch": 6.62108072048032, + "step": 19850 + }, + { + "epoch": 6.62108072048032, + "ref_ce_loss": 0.0713123083114624, + "step": 19850 + }, + { + "epoch": 6.624416277518345, + "loss": 0.3245, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "grad_norm": 2.375483274459839, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "learning_rate": 2.266604658759264e-05, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "loss": 0.3437642455101013, + "step": 19860 + }, + { + "ce_loss": 0.04593540355563164, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "distill_loss": 0.11273406445980072, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "ref_ce_loss": 0.07087185233831406, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "loss": 0.2620472311973572, + "step": 19860 + }, + { + "ce_loss": 0.032802462577819824, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "distill_loss": 0.15501290559768677, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "ref_ce_loss": 0.042995888739824295, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "loss": 0.24187412858009338, + "step": 19860 + }, + { + "ce_loss": 0.036589235067367554, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "distill_loss": 0.13851779699325562, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "ref_ce_loss": 0.04687894880771637, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "loss": 0.2755400836467743, + "step": 19860 + }, + { + "ce_loss": 0.049808893352746964, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "distill_loss": 0.17341046035289764, + "epoch": 6.624416277518345, + "step": 19860 + }, + { + "epoch": 6.624416277518345, + "ref_ce_loss": 0.05203511193394661, + "step": 19860 + }, + { + "epoch": 6.627751834556371, + "loss": 0.3231, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "grad_norm": 3.1982598304748535, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "learning_rate": 2.2559095906173975e-05, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "loss": 0.2873413562774658, + "step": 19870 + }, + { + "ce_loss": 0.06314130872488022, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "distill_loss": 0.12433145940303802, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "ref_ce_loss": 0.07966389507055283, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "loss": 0.29026103019714355, + "step": 19870 + }, + { + "ce_loss": 0.027912983670830727, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "distill_loss": 0.20554634928703308, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "ref_ce_loss": 0.05665810778737068, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "loss": 0.26199522614479065, + "step": 19870 + }, + { + "ce_loss": 0.04706485942006111, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "distill_loss": 0.13014474511146545, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "ref_ce_loss": 0.044131312519311905, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "loss": 0.2666011452674866, + "step": 19870 + }, + { + "ce_loss": 0.04390444606542587, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "distill_loss": 0.12119217962026596, + "epoch": 6.627751834556371, + "step": 19870 + }, + { + "epoch": 6.627751834556371, + "ref_ce_loss": 0.08273381739854813, + "step": 19870 + }, + { + "epoch": 6.631087391594396, + "loss": 0.3399, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "grad_norm": 2.9726524353027344, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "learning_rate": 2.2452377626693036e-05, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "loss": 0.44170236587524414, + "step": 19880 + }, + { + "ce_loss": 0.08167297393083572, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "distill_loss": 0.18584305047988892, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "ref_ce_loss": 0.07174765318632126, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "loss": 0.32071903347969055, + "step": 19880 + }, + { + "ce_loss": 0.04414434731006622, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "distill_loss": 0.13835738599300385, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "ref_ce_loss": 0.05073229968547821, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "loss": 0.2592620849609375, + "step": 19880 + }, + { + "ce_loss": 0.06465975940227509, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "distill_loss": 0.14733535051345825, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "ref_ce_loss": 0.047104138880968094, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "loss": 0.38310331106185913, + "step": 19880 + }, + { + "ce_loss": 0.10056553035974503, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "distill_loss": 0.1666891872882843, + "epoch": 6.631087391594396, + "step": 19880 + }, + { + "epoch": 6.631087391594396, + "ref_ce_loss": 0.07143868505954742, + "step": 19880 + }, + { + "epoch": 6.634422948632421, + "loss": 0.362, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "grad_norm": 2.2830758094787598, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "learning_rate": 2.2345891943761868e-05, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "loss": 0.228542760014534, + "step": 19890 + }, + { + "ce_loss": 0.012633942998945713, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "distill_loss": 0.10933070629835129, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "ref_ce_loss": 0.03913666307926178, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "loss": 0.25820690393447876, + "step": 19890 + }, + { + "ce_loss": 0.047330550849437714, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "distill_loss": 0.11249566078186035, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "ref_ce_loss": 0.07109885662794113, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "loss": 0.2614418566226959, + "step": 19890 + }, + { + "ce_loss": 0.04621879383921623, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "distill_loss": 0.12126925587654114, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "ref_ce_loss": 0.06826559454202652, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "loss": 0.3106783330440521, + "step": 19890 + }, + { + "ce_loss": 0.08366413414478302, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "distill_loss": 0.1382652223110199, + "epoch": 6.634422948632421, + "step": 19890 + }, + { + "epoch": 6.634422948632421, + "ref_ce_loss": 0.05525009334087372, + "step": 19890 + }, + { + "epoch": 6.637758505670447, + "loss": 0.3104, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "grad_norm": 2.347456216812134, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "learning_rate": 2.223963905156837e-05, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "loss": 0.37175440788269043, + "step": 19900 + }, + { + "ce_loss": 0.026647180318832397, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "distill_loss": 0.2211373895406723, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "ref_ce_loss": 0.07487079501152039, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "loss": 0.31490853428840637, + "step": 19900 + }, + { + "ce_loss": 0.025211889296770096, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "distill_loss": 0.18323150277137756, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "ref_ce_loss": 0.07476285099983215, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "loss": 0.3347644507884979, + "step": 19900 + }, + { + "ce_loss": 0.07297145575284958, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "distill_loss": 0.17718927562236786, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "ref_ce_loss": 0.05877537652850151, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "loss": 0.24840539693832397, + "step": 19900 + }, + { + "ce_loss": 0.026537470519542694, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "distill_loss": 0.14945150911808014, + "epoch": 6.637758505670447, + "step": 19900 + }, + { + "epoch": 6.637758505670447, + "ref_ce_loss": 0.049950964748859406, + "step": 19900 + }, + { + "epoch": 6.641094062708472, + "loss": 0.3258, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "grad_norm": 3.141188383102417, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "learning_rate": 2.2133619143875915e-05, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "loss": 0.16172029078006744, + "step": 19910 + }, + { + "ce_loss": 0.019133074209094048, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "distill_loss": 0.09872619807720184, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "ref_ce_loss": 0.04360087215900421, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "loss": 0.42412227392196655, + "step": 19910 + }, + { + "ce_loss": 0.06478115171194077, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "distill_loss": 0.14595475792884827, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "ref_ce_loss": 0.10079323500394821, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "loss": 0.4226725697517395, + "step": 19910 + }, + { + "ce_loss": 0.0367148257791996, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "distill_loss": 0.21141110360622406, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "ref_ce_loss": 0.07302436232566833, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "loss": 0.289275586605072, + "step": 19910 + }, + { + "ce_loss": 0.02699970081448555, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "distill_loss": 0.18730556964874268, + "epoch": 6.641094062708472, + "step": 19910 + }, + { + "epoch": 6.641094062708472, + "ref_ce_loss": 0.052146654576063156, + "step": 19910 + }, + { + "epoch": 6.644429619746497, + "loss": 0.3361, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "grad_norm": 2.280363082885742, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "learning_rate": 2.2027832414022946e-05, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "loss": 0.29020074009895325, + "step": 19920 + }, + { + "ce_loss": 0.04425665736198425, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "distill_loss": 0.16273631155490875, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "ref_ce_loss": 0.08309007436037064, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "loss": 0.26776793599128723, + "step": 19920 + }, + { + "ce_loss": 0.03876633942127228, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "distill_loss": 0.1577092856168747, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "ref_ce_loss": 0.053327981382608414, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "loss": 1.059072732925415, + "step": 19920 + }, + { + "ce_loss": 0.05745551362633705, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "distill_loss": 0.19399422407150269, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "ref_ce_loss": 0.05763303115963936, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "loss": 0.21548689901828766, + "step": 19920 + }, + { + "ce_loss": 0.027986306697130203, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "distill_loss": 0.10086776316165924, + "epoch": 6.644429619746497, + "step": 19920 + }, + { + "epoch": 6.644429619746497, + "ref_ce_loss": 0.05304734781384468, + "step": 19920 + }, + { + "epoch": 6.647765176784523, + "loss": 0.3455, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "grad_norm": 2.4683637619018555, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "learning_rate": 2.192227905492275e-05, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "loss": 0.26920247077941895, + "step": 19930 + }, + { + "ce_loss": 0.06250952929258347, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "distill_loss": 0.1409088373184204, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "ref_ce_loss": 0.05059449374675751, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "loss": 0.1810358762741089, + "step": 19930 + }, + { + "ce_loss": 0.023886408656835556, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "distill_loss": 0.11785140633583069, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "ref_ce_loss": 0.03921708092093468, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "loss": 0.23555897176265717, + "step": 19930 + }, + { + "ce_loss": 0.029333343729376793, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "distill_loss": 0.12829671800136566, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "ref_ce_loss": 0.05001313239336014, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "loss": 0.3103119134902954, + "step": 19930 + }, + { + "ce_loss": 0.04330439865589142, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "distill_loss": 0.13884636759757996, + "epoch": 6.647765176784523, + "step": 19930 + }, + { + "epoch": 6.647765176784523, + "ref_ce_loss": 0.07495592534542084, + "step": 19930 + }, + { + "epoch": 6.651100733822548, + "loss": 0.3722, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "grad_norm": 2.8103508949279785, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "learning_rate": 2.1816959259063034e-05, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "loss": 0.7184747457504272, + "step": 19940 + }, + { + "ce_loss": 0.07260715216398239, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "distill_loss": 0.17788881063461304, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "ref_ce_loss": 0.05647752434015274, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "loss": 0.34915584325790405, + "step": 19940 + }, + { + "ce_loss": 0.028711065649986267, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "distill_loss": 0.12251932173967361, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "ref_ce_loss": 0.057379350066185, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "loss": 0.41050225496292114, + "step": 19940 + }, + { + "ce_loss": 0.008283420465886593, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "distill_loss": 0.2010456621646881, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "ref_ce_loss": 0.062345150858163834, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "loss": 0.24619829654693604, + "step": 19940 + }, + { + "ce_loss": 0.03133643418550491, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "distill_loss": 0.11145929992198944, + "epoch": 6.651100733822548, + "step": 19940 + }, + { + "epoch": 6.651100733822548, + "ref_ce_loss": 0.03711400181055069, + "step": 19940 + }, + { + "epoch": 6.654436290860573, + "loss": 0.3623, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "grad_norm": 5.005858898162842, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "learning_rate": 2.1711873218505533e-05, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "loss": 0.26657575368881226, + "step": 19950 + }, + { + "ce_loss": 0.05676257982850075, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "distill_loss": 0.1302073895931244, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "ref_ce_loss": 0.06166147440671921, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "loss": 0.8528383374214172, + "step": 19950 + }, + { + "ce_loss": 0.13403521478176117, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "distill_loss": 0.1755397915840149, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "ref_ce_loss": 0.0475611612200737, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "loss": 0.23576173186302185, + "step": 19950 + }, + { + "ce_loss": 0.032652635127305984, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "distill_loss": 0.11732666939496994, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "ref_ce_loss": 0.053122635930776596, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "loss": 0.23466111719608307, + "step": 19950 + }, + { + "ce_loss": 0.018693789839744568, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "distill_loss": 0.1024949923157692, + "epoch": 6.654436290860573, + "step": 19950 + }, + { + "epoch": 6.654436290860573, + "ref_ce_loss": 0.0520513616502285, + "step": 19950 + }, + { + "epoch": 6.657771847898599, + "loss": 0.3175, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "grad_norm": 2.6065797805786133, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "learning_rate": 2.160702112488577e-05, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "loss": 0.29395872354507446, + "step": 19960 + }, + { + "ce_loss": 0.0614352822303772, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "distill_loss": 0.14185380935668945, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "ref_ce_loss": 0.06420893967151642, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "loss": 0.5165930390357971, + "step": 19960 + }, + { + "ce_loss": 0.018710751086473465, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "distill_loss": 0.27090543508529663, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "ref_ce_loss": 0.05845800042152405, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "loss": 0.2694173753261566, + "step": 19960 + }, + { + "ce_loss": 0.034820299595594406, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "distill_loss": 0.0995931401848793, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "ref_ce_loss": 0.023787811398506165, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "loss": 0.21081291139125824, + "step": 19960 + }, + { + "ce_loss": 0.04316055029630661, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "distill_loss": 0.10261419415473938, + "epoch": 6.657771847898599, + "step": 19960 + }, + { + "epoch": 6.657771847898599, + "ref_ce_loss": 0.04531471058726311, + "step": 19960 + }, + { + "epoch": 6.661107404936624, + "loss": 0.3254, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "grad_norm": 3.9140877723693848, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "learning_rate": 2.1502403169412564e-05, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "loss": 0.4480225741863251, + "step": 19970 + }, + { + "ce_loss": 0.09914959967136383, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "distill_loss": 0.2714824974536896, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "ref_ce_loss": 0.05960004776716232, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "loss": 0.29610762000083923, + "step": 19970 + }, + { + "ce_loss": 0.05531436204910278, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "distill_loss": 0.12633873522281647, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "ref_ce_loss": 0.02944735810160637, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "loss": 0.2766069173812866, + "step": 19970 + }, + { + "ce_loss": 0.08344614505767822, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "distill_loss": 0.12200739234685898, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "ref_ce_loss": 0.07094717770814896, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "loss": 0.29538315534591675, + "step": 19970 + }, + { + "ce_loss": 0.028203513473272324, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "distill_loss": 0.10594826191663742, + "epoch": 6.661107404936624, + "step": 19970 + }, + { + "epoch": 6.661107404936624, + "ref_ce_loss": 0.03950703516602516, + "step": 19970 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.3542, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "grad_norm": 2.6698086261749268, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "learning_rate": 2.13980195428678e-05, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.20336762070655823, + "step": 19980 + }, + { + "ce_loss": 0.013415360823273659, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "distill_loss": 0.09042014181613922, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "ref_ce_loss": 0.042300231754779816, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.2975046932697296, + "step": 19980 + }, + { + "ce_loss": 0.051743652671575546, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "distill_loss": 0.144939586520195, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "ref_ce_loss": 0.05688483640551567, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.2748930752277374, + "step": 19980 + }, + { + "ce_loss": 0.048272471874952316, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "distill_loss": 0.13878586888313293, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "ref_ce_loss": 0.07151763886213303, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "loss": 0.1987239122390747, + "step": 19980 + }, + { + "ce_loss": 0.020666783675551414, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "distill_loss": 0.09921713918447495, + "epoch": 6.6644429619746495, + "step": 19980 + }, + { + "epoch": 6.6644429619746495, + "ref_ce_loss": 0.05615457147359848, + "step": 19980 + }, + { + "epoch": 6.667778519012675, + "loss": 0.3404, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "grad_norm": 4.083954811096191, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "learning_rate": 2.1293870435606047e-05, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "loss": 0.3248312473297119, + "step": 19990 + }, + { + "ce_loss": 0.07799259573221207, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "distill_loss": 0.17638051509857178, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "ref_ce_loss": 0.06979241222143173, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "loss": 0.3776983916759491, + "step": 19990 + }, + { + "ce_loss": 0.08105441927909851, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "distill_loss": 0.18132013082504272, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "ref_ce_loss": 0.08397438377141953, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "loss": 0.4606001377105713, + "step": 19990 + }, + { + "ce_loss": 0.10863593220710754, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "distill_loss": 0.16064369678497314, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "ref_ce_loss": 0.10042203217744827, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "loss": 0.3327219486236572, + "step": 19990 + }, + { + "ce_loss": 0.06780189275741577, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "distill_loss": 0.2290269136428833, + "epoch": 6.667778519012675, + "step": 19990 + }, + { + "epoch": 6.667778519012675, + "ref_ce_loss": 0.03579997643828392, + "step": 19990 + }, + { + "epoch": 6.6711140760507, + "loss": 0.3328, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "grad_norm": 4.256439208984375, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "learning_rate": 2.1189956037554197e-05, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "loss": 0.22355251014232635, + "step": 20000 + }, + { + "ce_loss": 0.022118112072348595, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "distill_loss": 0.12368350476026535, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "ref_ce_loss": 0.07756340503692627, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "loss": 0.24665537476539612, + "step": 20000 + }, + { + "ce_loss": 0.018713662400841713, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "distill_loss": 0.1372748613357544, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "ref_ce_loss": 0.026346355676651, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "loss": 0.23919732868671417, + "step": 20000 + }, + { + "ce_loss": 0.03264044597744942, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "distill_loss": 0.13985341787338257, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "ref_ce_loss": 0.0665779784321785, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "loss": 0.197180837392807, + "step": 20000 + }, + { + "ce_loss": 0.017834221944212914, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "distill_loss": 0.11055406928062439, + "epoch": 6.6711140760507, + "step": 20000 + }, + { + "epoch": 6.6711140760507, + "ref_ce_loss": 0.049189746379852295, + "step": 20000 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.3045, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "grad_norm": 2.5865511894226074, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "learning_rate": 2.1086276538211144e-05, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.42007237672805786, + "step": 20010 + }, + { + "ce_loss": 0.08485287427902222, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "distill_loss": 0.14501990377902985, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "ref_ce_loss": 0.07646860182285309, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.3304160237312317, + "step": 20010 + }, + { + "ce_loss": 0.018804801627993584, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "distill_loss": 0.11846814304590225, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "ref_ce_loss": 0.05221167951822281, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.23049429059028625, + "step": 20010 + }, + { + "ce_loss": 0.024256093427538872, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "distill_loss": 0.15001031756401062, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "ref_ce_loss": 0.05604208633303642, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "loss": 0.29497581720352173, + "step": 20010 + }, + { + "ce_loss": 0.012918825261294842, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "distill_loss": 0.11682042479515076, + "epoch": 6.6744496330887255, + "step": 20010 + }, + { + "epoch": 6.6744496330887255, + "ref_ce_loss": 0.05747944861650467, + "step": 20010 + }, + { + "epoch": 6.677785190126751, + "loss": 0.319, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "grad_norm": 2.0363032817840576, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "learning_rate": 2.0982832126647352e-05, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "loss": 0.44392961263656616, + "step": 20020 + }, + { + "ce_loss": 0.0858464390039444, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "distill_loss": 0.2778363525867462, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "ref_ce_loss": 0.08002053946256638, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "loss": 0.2499372363090515, + "step": 20020 + }, + { + "ce_loss": 0.026511620730161667, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "distill_loss": 0.09822176396846771, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "ref_ce_loss": 0.03569779545068741, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "loss": 0.2579372823238373, + "step": 20020 + }, + { + "ce_loss": 0.008953968994319439, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "distill_loss": 0.1450115293264389, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "ref_ce_loss": 0.08302398025989532, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "loss": 0.26329633593559265, + "step": 20020 + }, + { + "ce_loss": 0.08157920092344284, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "distill_loss": 0.1411052644252777, + "epoch": 6.677785190126751, + "step": 20020 + }, + { + "epoch": 6.677785190126751, + "ref_ce_loss": 0.031311504542827606, + "step": 20020 + }, + { + "epoch": 6.681120747164776, + "loss": 0.3117, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "grad_norm": 2.237546682357788, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "learning_rate": 2.087962299150464e-05, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "loss": 0.3706545829772949, + "step": 20030 + }, + { + "ce_loss": 0.05204419791698456, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "distill_loss": 0.17340388894081116, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "ref_ce_loss": 0.10555092245340347, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "loss": 0.38419580459594727, + "step": 20030 + }, + { + "ce_loss": 0.029340149834752083, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "distill_loss": 0.28156542778015137, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "ref_ce_loss": 0.058754369616508484, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "loss": 0.46436163783073425, + "step": 20030 + }, + { + "ce_loss": 0.031260956078767776, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "distill_loss": 0.15603461861610413, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "ref_ce_loss": 0.06692703813314438, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "loss": 0.1949024498462677, + "step": 20030 + }, + { + "ce_loss": 0.016338814049959183, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "distill_loss": 0.10759077966213226, + "epoch": 6.681120747164776, + "step": 20030 + }, + { + "epoch": 6.681120747164776, + "ref_ce_loss": 0.05400192365050316, + "step": 20030 + }, + { + "epoch": 6.684456304202802, + "loss": 0.3256, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "grad_norm": 4.661191940307617, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "learning_rate": 2.0776649320995754e-05, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "loss": 0.18244490027427673, + "step": 20040 + }, + { + "ce_loss": 0.004614561330527067, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "distill_loss": 0.12099388986825943, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "ref_ce_loss": 0.035793665796518326, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "loss": 0.19822382926940918, + "step": 20040 + }, + { + "ce_loss": 0.03608737140893936, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "distill_loss": 0.11221136897802353, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "ref_ce_loss": 0.03879408538341522, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "loss": 0.32134145498275757, + "step": 20040 + }, + { + "ce_loss": 0.050765812397003174, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "distill_loss": 0.17691972851753235, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "ref_ce_loss": 0.04532846063375473, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "loss": 0.180606871843338, + "step": 20040 + }, + { + "ce_loss": 0.036758411675691605, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "distill_loss": 0.07417429238557816, + "epoch": 6.684456304202802, + "step": 20040 + }, + { + "epoch": 6.684456304202802, + "ref_ce_loss": 0.042639512568712234, + "step": 20040 + }, + { + "epoch": 6.687791861240827, + "loss": 0.2825, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "grad_norm": 2.5875606536865234, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "learning_rate": 2.0673911302904046e-05, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "loss": 0.18170462548732758, + "step": 20050 + }, + { + "ce_loss": 0.025139065459370613, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "distill_loss": 0.07967349141836166, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "ref_ce_loss": 0.040121741592884064, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "loss": 0.4045209586620331, + "step": 20050 + }, + { + "ce_loss": 0.02009030058979988, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "distill_loss": 0.13364392518997192, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "ref_ce_loss": 0.03764891251921654, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "loss": 0.5435401201248169, + "step": 20050 + }, + { + "ce_loss": 0.05513055622577667, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "distill_loss": 0.12308389693498611, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "ref_ce_loss": 0.0426427386701107, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "loss": 0.31125912070274353, + "step": 20050 + }, + { + "ce_loss": 0.078745536506176, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "distill_loss": 0.1420706957578659, + "epoch": 6.687791861240827, + "step": 20050 + }, + { + "epoch": 6.687791861240827, + "ref_ce_loss": 0.050845950841903687, + "step": 20050 + }, + { + "epoch": 6.691127418278852, + "loss": 0.3104, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "grad_norm": 2.9328770637512207, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "learning_rate": 2.05714091245832e-05, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "loss": 0.28509998321533203, + "step": 20060 + }, + { + "ce_loss": 0.050861671566963196, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "distill_loss": 0.13205139338970184, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "ref_ce_loss": 0.040802810341119766, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "loss": 0.33471447229385376, + "step": 20060 + }, + { + "ce_loss": 0.025919100269675255, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "distill_loss": 0.158289834856987, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "ref_ce_loss": 0.0653957799077034, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "loss": 0.2894524037837982, + "step": 20060 + }, + { + "ce_loss": 0.04996928945183754, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "distill_loss": 0.11954309046268463, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "ref_ce_loss": 0.05506157875061035, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "loss": 0.22797024250030518, + "step": 20060 + }, + { + "ce_loss": 0.03314273804426193, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "distill_loss": 0.10626078397035599, + "epoch": 6.691127418278852, + "step": 20060 + }, + { + "epoch": 6.691127418278852, + "ref_ce_loss": 0.0642261654138565, + "step": 20060 + }, + { + "epoch": 6.694462975316878, + "loss": 0.3243, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "grad_norm": 3.6613128185272217, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "learning_rate": 2.046914297295664e-05, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "loss": 0.20102205872535706, + "step": 20070 + }, + { + "ce_loss": 0.03726993128657341, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "distill_loss": 0.10248197615146637, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "ref_ce_loss": 0.047353606671094894, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "loss": 0.17317506670951843, + "step": 20070 + }, + { + "ce_loss": 0.005286495666950941, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "distill_loss": 0.10332041233778, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "ref_ce_loss": 0.06435637921094894, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "loss": 0.5771521329879761, + "step": 20070 + }, + { + "ce_loss": 0.04414547234773636, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "distill_loss": 0.24496690928936005, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "ref_ce_loss": 0.0880177766084671, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "loss": 0.4186633229255676, + "step": 20070 + }, + { + "ce_loss": 0.029012061655521393, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "distill_loss": 0.14431175589561462, + "epoch": 6.694462975316878, + "step": 20070 + }, + { + "epoch": 6.694462975316878, + "ref_ce_loss": 0.04155807942152023, + "step": 20070 + }, + { + "epoch": 6.697798532354903, + "loss": 0.3458, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "grad_norm": 3.196727752685547, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "learning_rate": 2.0367113034517564e-05, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "loss": 0.2703838050365448, + "step": 20080 + }, + { + "ce_loss": 0.05823126807808876, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "distill_loss": 0.11969301104545593, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "ref_ce_loss": 0.07414967566728592, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "loss": 0.3451869487762451, + "step": 20080 + }, + { + "ce_loss": 0.07295867055654526, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "distill_loss": 0.1378656029701233, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "ref_ce_loss": 0.07279222458600998, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "loss": 0.2282976359128952, + "step": 20080 + }, + { + "ce_loss": 0.043244145810604095, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "distill_loss": 0.11198434233665466, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "ref_ce_loss": 0.0550074465572834, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "loss": 0.4151057004928589, + "step": 20080 + }, + { + "ce_loss": 0.11792191118001938, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "distill_loss": 0.19072198867797852, + "epoch": 6.697798532354903, + "step": 20080 + }, + { + "epoch": 6.697798532354903, + "ref_ce_loss": 0.10631078481674194, + "step": 20080 + }, + { + "epoch": 6.701134089392928, + "loss": 0.3425, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "grad_norm": 2.2497243881225586, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "learning_rate": 2.0265319495328326e-05, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "loss": 0.5373490452766418, + "step": 20090 + }, + { + "ce_loss": 0.02349102683365345, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "distill_loss": 0.21344724297523499, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "ref_ce_loss": 0.08333466947078705, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "loss": 0.24514442682266235, + "step": 20090 + }, + { + "ce_loss": 0.03692420944571495, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "distill_loss": 0.1548544466495514, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "ref_ce_loss": 0.036220796406269073, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "loss": 0.191425621509552, + "step": 20090 + }, + { + "ce_loss": 0.04407431185245514, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "distill_loss": 0.09822914004325867, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "ref_ce_loss": 0.03383997455239296, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "loss": 0.31227219104766846, + "step": 20090 + }, + { + "ce_loss": 0.08893024921417236, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "distill_loss": 0.17280413210391998, + "epoch": 6.701134089392928, + "step": 20090 + }, + { + "epoch": 6.701134089392928, + "ref_ce_loss": 0.05040694400668144, + "step": 20090 + }, + { + "epoch": 6.704469646430954, + "loss": 0.3198, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "grad_norm": 2.1753933429718018, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "learning_rate": 2.0163762541020124e-05, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "loss": 0.35048708319664, + "step": 20100 + }, + { + "ce_loss": 0.10514289140701294, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "distill_loss": 0.18074220418930054, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "ref_ce_loss": 0.0645001083612442, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "loss": 0.32312288880348206, + "step": 20100 + }, + { + "ce_loss": 0.04791853949427605, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "distill_loss": 0.18330535292625427, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "ref_ce_loss": 0.041524119675159454, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "loss": 0.35315945744514465, + "step": 20100 + }, + { + "ce_loss": 0.06787319481372833, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "distill_loss": 0.14994627237319946, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "ref_ce_loss": 0.05921891704201698, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "loss": 0.38501298427581787, + "step": 20100 + }, + { + "ce_loss": 0.0495845191180706, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "distill_loss": 0.12040823698043823, + "epoch": 6.704469646430954, + "step": 20100 + }, + { + "epoch": 6.704469646430954, + "ref_ce_loss": 0.06525509059429169, + "step": 20100 + }, + { + "epoch": 6.707805203468979, + "loss": 0.3196, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "grad_norm": 2.6369545459747314, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "learning_rate": 2.0062442356792864e-05, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "loss": 0.2736344039440155, + "step": 20110 + }, + { + "ce_loss": 0.03203265741467476, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "distill_loss": 0.127993643283844, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "ref_ce_loss": 0.0858127698302269, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "loss": 0.13974420726299286, + "step": 20110 + }, + { + "ce_loss": 0.0016820939490571618, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "distill_loss": 0.08246586471796036, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "ref_ce_loss": 0.02231876365840435, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "loss": 0.2535470724105835, + "step": 20110 + }, + { + "ce_loss": 0.03593399375677109, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "distill_loss": 0.1197575107216835, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "ref_ce_loss": 0.060538504272699356, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "loss": 0.2826269268989563, + "step": 20110 + }, + { + "ce_loss": 0.039776112884283066, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "distill_loss": 0.12120041996240616, + "epoch": 6.707805203468979, + "step": 20110 + }, + { + "epoch": 6.707805203468979, + "ref_ce_loss": 0.061351049691438675, + "step": 20110 + }, + { + "epoch": 6.711140760507004, + "loss": 0.3117, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "grad_norm": 3.368558168411255, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "learning_rate": 1.9961359127414578e-05, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "loss": 0.2674647271633148, + "step": 20120 + }, + { + "ce_loss": 0.02018035016953945, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "distill_loss": 0.13457518815994263, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "ref_ce_loss": 0.08921375125646591, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "loss": 0.20268602669239044, + "step": 20120 + }, + { + "ce_loss": 0.0070023308508098125, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "distill_loss": 0.13283786177635193, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "ref_ce_loss": 0.06247830390930176, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "loss": 0.47161468863487244, + "step": 20120 + }, + { + "ce_loss": 0.06164795160293579, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "distill_loss": 0.23084260523319244, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "ref_ce_loss": 0.09241283684968948, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "loss": 0.22700239717960358, + "step": 20120 + }, + { + "ce_loss": 0.03247182443737984, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "distill_loss": 0.12559150159358978, + "epoch": 6.711140760507004, + "step": 20120 + }, + { + "epoch": 6.711140760507004, + "ref_ce_loss": 0.0504399836063385, + "step": 20120 + }, + { + "epoch": 6.71447631754503, + "loss": 0.3401, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "grad_norm": 2.520934820175171, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "learning_rate": 1.9860513037221165e-05, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "loss": 0.24574658274650574, + "step": 20130 + }, + { + "ce_loss": 0.06673982739448547, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "distill_loss": 0.0998455137014389, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "ref_ce_loss": 0.04743276908993721, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "loss": 0.3920712471008301, + "step": 20130 + }, + { + "ce_loss": 0.10085289925336838, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "distill_loss": 0.15075841546058655, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "ref_ce_loss": 0.06611064076423645, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "loss": 0.5583319067955017, + "step": 20130 + }, + { + "ce_loss": 0.12576350569725037, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "distill_loss": 0.31589075922966003, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "ref_ce_loss": 0.08344506472349167, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "loss": 0.26720836758613586, + "step": 20130 + }, + { + "ce_loss": 0.06933542340993881, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "distill_loss": 0.1287502497434616, + "epoch": 6.71447631754503, + "step": 20130 + }, + { + "epoch": 6.71447631754503, + "ref_ce_loss": 0.04018561542034149, + "step": 20130 + }, + { + "epoch": 6.717811874583055, + "loss": 0.3339, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "grad_norm": 2.886352300643921, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "learning_rate": 1.9759904270116165e-05, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "loss": 0.18582403659820557, + "step": 20140 + }, + { + "ce_loss": 0.00801194366067648, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "distill_loss": 0.12590402364730835, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "ref_ce_loss": 0.05173363909125328, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "loss": 0.5001078248023987, + "step": 20140 + }, + { + "ce_loss": 0.07312473654747009, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "distill_loss": 0.31643715500831604, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "ref_ce_loss": 0.07059961557388306, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "loss": 0.39632368087768555, + "step": 20140 + }, + { + "ce_loss": 0.041875191032886505, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "distill_loss": 0.1932414025068283, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "ref_ce_loss": 0.07255880534648895, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "loss": 0.2731451988220215, + "step": 20140 + }, + { + "ce_loss": 0.02847844362258911, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "distill_loss": 0.1204526498913765, + "epoch": 6.717811874583055, + "step": 20140 + }, + { + "epoch": 6.717811874583055, + "ref_ce_loss": 0.03780105710029602, + "step": 20140 + }, + { + "epoch": 6.72114743162108, + "loss": 0.3203, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "grad_norm": 2.101719856262207, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "learning_rate": 1.9659533009570223e-05, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "loss": 0.4778059124946594, + "step": 20150 + }, + { + "ce_loss": 0.03960665687918663, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "distill_loss": 0.2315598428249359, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "ref_ce_loss": 0.055229682475328445, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "loss": 0.23675209283828735, + "step": 20150 + }, + { + "ce_loss": 0.04655850678682327, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "distill_loss": 0.10201462358236313, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "ref_ce_loss": 0.05079250782728195, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "loss": 0.47496479749679565, + "step": 20150 + }, + { + "ce_loss": 0.08386895060539246, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "distill_loss": 0.1764313131570816, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "ref_ce_loss": 0.08487064391374588, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "loss": 0.36193379759788513, + "step": 20150 + }, + { + "ce_loss": 0.09721614420413971, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "distill_loss": 0.1604432463645935, + "epoch": 6.72114743162108, + "step": 20150 + }, + { + "epoch": 6.72114743162108, + "ref_ce_loss": 0.07095256447792053, + "step": 20150 + }, + { + "epoch": 6.724482988659106, + "loss": 0.3354, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "grad_norm": 2.4998772144317627, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "learning_rate": 1.9559399438620916e-05, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "loss": 0.47663408517837524, + "step": 20160 + }, + { + "ce_loss": 0.05730144679546356, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "distill_loss": 0.263236403465271, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "ref_ce_loss": 0.10452139377593994, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "loss": 0.29100215435028076, + "step": 20160 + }, + { + "ce_loss": 0.01226563099771738, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "distill_loss": 0.09933969378471375, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "ref_ce_loss": 0.06125570833683014, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "loss": 0.19560547173023224, + "step": 20160 + }, + { + "ce_loss": 0.028779683634638786, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "distill_loss": 0.0916346088051796, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "ref_ce_loss": 0.06244521588087082, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "loss": 0.4223164916038513, + "step": 20160 + }, + { + "ce_loss": 0.06593258678913116, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "distill_loss": 0.20690634846687317, + "epoch": 6.724482988659106, + "step": 20160 + }, + { + "epoch": 6.724482988659106, + "ref_ce_loss": 0.10990146547555923, + "step": 20160 + }, + { + "epoch": 6.727818545697131, + "loss": 0.3546, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "grad_norm": 2.427511692047119, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "learning_rate": 1.945950373987248e-05, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "loss": 0.24235740303993225, + "step": 20170 + }, + { + "ce_loss": 0.039137471467256546, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "distill_loss": 0.11441465467214584, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "ref_ce_loss": 0.06497035175561905, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "loss": 0.3280067443847656, + "step": 20170 + }, + { + "ce_loss": 0.04011811688542366, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "distill_loss": 0.10386212170124054, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "ref_ce_loss": 0.0749388188123703, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "loss": 0.2595832645893097, + "step": 20170 + }, + { + "ce_loss": 0.08086634427309036, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "distill_loss": 0.1273350715637207, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "ref_ce_loss": 0.042277704924345016, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "loss": 0.2752918004989624, + "step": 20170 + }, + { + "ce_loss": 0.024764977395534515, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "distill_loss": 0.12322026491165161, + "epoch": 6.727818545697131, + "step": 20170 + }, + { + "epoch": 6.727818545697131, + "ref_ce_loss": 0.058638669550418854, + "step": 20170 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.2855, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "grad_norm": 2.475968360900879, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "learning_rate": 1.9359846095495158e-05, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.31172600388526917, + "step": 20180 + }, + { + "ce_loss": 0.07925920933485031, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "distill_loss": 0.1636563092470169, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "ref_ce_loss": 0.06868557631969452, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.41545963287353516, + "step": 20180 + }, + { + "ce_loss": 0.0213873703032732, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "distill_loss": 0.1410737931728363, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "ref_ce_loss": 0.07231228053569794, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.27609914541244507, + "step": 20180 + }, + { + "ce_loss": 0.05257976055145264, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "distill_loss": 0.1249610036611557, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "ref_ce_loss": 0.06813840568065643, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "loss": 0.2992923855781555, + "step": 20180 + }, + { + "ce_loss": 0.04836418479681015, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "distill_loss": 0.17673200368881226, + "epoch": 6.7311541027351565, + "step": 20180 + }, + { + "epoch": 6.7311541027351565, + "ref_ce_loss": 0.058475811034440994, + "step": 20180 + }, + { + "epoch": 6.734489659773182, + "loss": 0.3448, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "grad_norm": 2.1060166358947754, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "learning_rate": 1.926042668722526e-05, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "loss": 0.3200954496860504, + "step": 20190 + }, + { + "ce_loss": 0.05356490612030029, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "distill_loss": 0.19207656383514404, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "ref_ce_loss": 0.05510849133133888, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "loss": 0.3080518841743469, + "step": 20190 + }, + { + "ce_loss": 0.02909858524799347, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "distill_loss": 0.14246626198291779, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "ref_ce_loss": 0.055522508919239044, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "loss": 0.21634231507778168, + "step": 20190 + }, + { + "ce_loss": 0.039301078766584396, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "distill_loss": 0.12853223085403442, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "ref_ce_loss": 0.04776563495397568, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "loss": 0.22381868958473206, + "step": 20190 + }, + { + "ce_loss": 0.03490525484085083, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "distill_loss": 0.10595700889825821, + "epoch": 6.734489659773182, + "step": 20190 + }, + { + "epoch": 6.734489659773182, + "ref_ce_loss": 0.05174877867102623, + "step": 20190 + }, + { + "epoch": 6.737825216811207, + "loss": 0.2956, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "grad_norm": 1.833837866783142, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "learning_rate": 1.9161245696364514e-05, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "loss": 0.2744770348072052, + "step": 20200 + }, + { + "ce_loss": 0.034656401723623276, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "distill_loss": 0.13560622930526733, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "ref_ce_loss": 0.05984001234173775, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "loss": 0.3653131425380707, + "step": 20200 + }, + { + "ce_loss": 0.08240316063165665, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "distill_loss": 0.18237349390983582, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "ref_ce_loss": 0.07820425182580948, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "loss": 0.28199803829193115, + "step": 20200 + }, + { + "ce_loss": 0.024960942566394806, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "distill_loss": 0.13476698100566864, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "ref_ce_loss": 0.061238691210746765, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "loss": 0.42400720715522766, + "step": 20200 + }, + { + "ce_loss": 0.1105445921421051, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "distill_loss": 0.21091026067733765, + "epoch": 6.737825216811207, + "step": 20200 + }, + { + "epoch": 6.737825216811207, + "ref_ce_loss": 0.08224047720432281, + "step": 20200 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.2882, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "grad_norm": 1.7699483633041382, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "learning_rate": 1.906230330377992e-05, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.3359181880950928, + "step": 20210 + }, + { + "ce_loss": 0.02041238360106945, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "distill_loss": 0.14281825721263885, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "ref_ce_loss": 0.08343903720378876, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.28383171558380127, + "step": 20210 + }, + { + "ce_loss": 0.03835010156035423, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "distill_loss": 0.12048612534999847, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "ref_ce_loss": 0.03666745126247406, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.26517578959465027, + "step": 20210 + }, + { + "ce_loss": 0.07440821081399918, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "distill_loss": 0.0954647958278656, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "ref_ce_loss": 0.05569697543978691, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "loss": 0.6115409731864929, + "step": 20210 + }, + { + "ce_loss": 0.024617154151201248, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "distill_loss": 0.310270220041275, + "epoch": 6.7411607738492325, + "step": 20210 + }, + { + "epoch": 6.7411607738492325, + "ref_ce_loss": 0.11816354840993881, + "step": 20210 + }, + { + "epoch": 6.744496330887258, + "loss": 0.3187, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "grad_norm": 2.3631792068481445, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "learning_rate": 1.8963599689903412e-05, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "loss": 0.17074748873710632, + "step": 20220 + }, + { + "ce_loss": 0.011627701111137867, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "distill_loss": 0.10479970276355743, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "ref_ce_loss": 0.04375007376074791, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "loss": 0.2588944733142853, + "step": 20220 + }, + { + "ce_loss": 0.016052883118391037, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "distill_loss": 0.21098734438419342, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "ref_ce_loss": 0.03143612667918205, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "loss": 0.25430601835250854, + "step": 20220 + }, + { + "ce_loss": 0.03086763434112072, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "distill_loss": 0.10999102145433426, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "ref_ce_loss": 0.04139406234025955, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "loss": 0.22506499290466309, + "step": 20220 + }, + { + "ce_loss": 0.02551484853029251, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "distill_loss": 0.11838529258966446, + "epoch": 6.744496330887258, + "step": 20220 + }, + { + "epoch": 6.744496330887258, + "ref_ce_loss": 0.048836853355169296, + "step": 20220 + }, + { + "epoch": 6.747831887925283, + "loss": 0.3145, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "grad_norm": 6.5614542961120605, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "learning_rate": 1.8865135034731416e-05, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "loss": 0.24494950473308563, + "step": 20230 + }, + { + "ce_loss": 0.03902106359601021, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "distill_loss": 0.1625560224056244, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "ref_ce_loss": 0.04319927096366882, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "loss": 0.4493194818496704, + "step": 20230 + }, + { + "ce_loss": 0.06314259022474289, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "distill_loss": 0.1928003877401352, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "ref_ce_loss": 0.06467821449041367, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "loss": 0.22811204195022583, + "step": 20230 + }, + { + "ce_loss": 0.029204674065113068, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "distill_loss": 0.11046045273542404, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "ref_ce_loss": 0.034495506435632706, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "loss": 0.24557246267795563, + "step": 20230 + }, + { + "ce_loss": 0.06584376096725464, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "distill_loss": 0.09908540546894073, + "epoch": 6.747831887925283, + "step": 20230 + }, + { + "epoch": 6.747831887925283, + "ref_ce_loss": 0.08043040335178375, + "step": 20230 + }, + { + "epoch": 6.751167444963309, + "loss": 0.3478, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "grad_norm": 3.3642992973327637, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "learning_rate": 1.876690951782464e-05, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "loss": 0.47724294662475586, + "step": 20240 + }, + { + "ce_loss": 0.03805273398756981, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "distill_loss": 0.25204703211784363, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "ref_ce_loss": 0.10946623235940933, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "loss": 0.36742380261421204, + "step": 20240 + }, + { + "ce_loss": 0.06362360715866089, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "distill_loss": 0.20535053312778473, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "ref_ce_loss": 0.0709865540266037, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "loss": 0.36857929825782776, + "step": 20240 + }, + { + "ce_loss": 0.06251908838748932, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "distill_loss": 0.12514689564704895, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "ref_ce_loss": 0.07997957617044449, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "loss": 0.3215661644935608, + "step": 20240 + }, + { + "ce_loss": 0.011648074723780155, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "distill_loss": 0.09662874042987823, + "epoch": 6.751167444963309, + "step": 20240 + }, + { + "epoch": 6.751167444963309, + "ref_ce_loss": 0.04865029454231262, + "step": 20240 + }, + { + "epoch": 6.754503002001334, + "loss": 0.3174, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "grad_norm": 3.293933153152466, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "learning_rate": 1.8668923318307704e-05, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "loss": 0.3013077676296234, + "step": 20250 + }, + { + "ce_loss": 0.0609404556453228, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "distill_loss": 0.12029371410608292, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "ref_ce_loss": 0.05728248506784439, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "loss": 0.36096930503845215, + "step": 20250 + }, + { + "ce_loss": 0.0662146508693695, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "distill_loss": 0.17438842356204987, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "ref_ce_loss": 0.07831903547048569, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "loss": 0.2404811829328537, + "step": 20250 + }, + { + "ce_loss": 0.031928323209285736, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "distill_loss": 0.10175152122974396, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "ref_ce_loss": 0.03808411583304405, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "loss": 0.2283913642168045, + "step": 20250 + }, + { + "ce_loss": 0.060152892023324966, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "distill_loss": 0.1293492317199707, + "epoch": 6.754503002001334, + "step": 20250 + }, + { + "epoch": 6.754503002001334, + "ref_ce_loss": 0.038714051246643066, + "step": 20250 + }, + { + "epoch": 6.757838559039359, + "loss": 0.395, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "grad_norm": 2.3881497383117676, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "learning_rate": 1.857117661486872e-05, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "loss": 0.26333901286125183, + "step": 20260 + }, + { + "ce_loss": 0.061003465205430984, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "distill_loss": 0.13676662743091583, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "ref_ce_loss": 0.06485778838396072, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "loss": 0.29492634534835815, + "step": 20260 + }, + { + "ce_loss": 0.00960566382855177, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "distill_loss": 0.15592965483665466, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "ref_ce_loss": 0.06057649105787277, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "loss": 0.25662752985954285, + "step": 20260 + }, + { + "ce_loss": 0.03762103244662285, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "distill_loss": 0.1429222673177719, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "ref_ce_loss": 0.07600852102041245, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "loss": 0.2905150055885315, + "step": 20260 + }, + { + "ce_loss": 0.03607148677110672, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "distill_loss": 0.15427818894386292, + "epoch": 6.757838559039359, + "step": 20260 + }, + { + "epoch": 6.757838559039359, + "ref_ce_loss": 0.05871858075261116, + "step": 20260 + }, + { + "epoch": 6.761174116077385, + "loss": 0.3381, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "grad_norm": 2.4188501834869385, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "learning_rate": 1.8473669585759154e-05, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "loss": 0.1986462026834488, + "step": 20270 + }, + { + "ce_loss": 0.03574904426932335, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "distill_loss": 0.1082804724574089, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "ref_ce_loss": 0.04469778388738632, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "loss": 0.28330090641975403, + "step": 20270 + }, + { + "ce_loss": 0.039637066423892975, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "distill_loss": 0.1479431688785553, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "ref_ce_loss": 0.045204803347587585, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "loss": 0.3502093553543091, + "step": 20270 + }, + { + "ce_loss": 0.08832836896181107, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "distill_loss": 0.1686418652534485, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "ref_ce_loss": 0.0489707887172699, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "loss": 0.3603600263595581, + "step": 20270 + }, + { + "ce_loss": 0.058100733906030655, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "distill_loss": 0.13964024186134338, + "epoch": 6.761174116077385, + "step": 20270 + }, + { + "epoch": 6.761174116077385, + "ref_ce_loss": 0.07787153869867325, + "step": 20270 + }, + { + "epoch": 6.76450967311541, + "loss": 0.3233, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "grad_norm": 2.8454596996307373, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "learning_rate": 1.837640240879335e-05, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "loss": 0.199926495552063, + "step": 20280 + }, + { + "ce_loss": 0.03727385401725769, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "distill_loss": 0.1146383136510849, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "ref_ce_loss": 0.04791300743818283, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "loss": 0.28659528493881226, + "step": 20280 + }, + { + "ce_loss": 0.030995670706033707, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "distill_loss": 0.1602456271648407, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "ref_ce_loss": 0.07240622490644455, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "loss": 0.4938913583755493, + "step": 20280 + }, + { + "ce_loss": 0.0463070385158062, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "distill_loss": 0.16774320602416992, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "ref_ce_loss": 0.061573222279548645, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "loss": 0.20674434304237366, + "step": 20280 + }, + { + "ce_loss": 0.02703999914228916, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "distill_loss": 0.10253699123859406, + "epoch": 6.76450967311541, + "step": 20280 + }, + { + "epoch": 6.76450967311541, + "ref_ce_loss": 0.057654231786727905, + "step": 20280 + }, + { + "epoch": 6.767845230153435, + "loss": 0.3143, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "grad_norm": 2.4084882736206055, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "learning_rate": 1.827937526134829e-05, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "loss": 0.2376161515712738, + "step": 20290 + }, + { + "ce_loss": 0.0391826331615448, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "distill_loss": 0.1317000836133957, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "ref_ce_loss": 0.046779513359069824, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "loss": 0.3904785215854645, + "step": 20290 + }, + { + "ce_loss": 0.05911394953727722, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "distill_loss": 0.14573772251605988, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "ref_ce_loss": 0.06968571990728378, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "loss": 0.3730168044567108, + "step": 20290 + }, + { + "ce_loss": 0.04287463426589966, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "distill_loss": 0.17994266748428345, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "ref_ce_loss": 0.0760711207985878, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "loss": 0.3494645953178406, + "step": 20290 + }, + { + "ce_loss": 0.058322224766016006, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "distill_loss": 0.18589213490486145, + "epoch": 6.767845230153435, + "step": 20290 + }, + { + "epoch": 6.767845230153435, + "ref_ce_loss": 0.077096126973629, + "step": 20290 + }, + { + "epoch": 6.771180787191461, + "loss": 0.3236, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "grad_norm": 3.896623134613037, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "learning_rate": 1.8182588320363234e-05, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "loss": 0.19223515689373016, + "step": 20300 + }, + { + "ce_loss": 0.022537581622600555, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "distill_loss": 0.11885391175746918, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "ref_ce_loss": 0.050274379551410675, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "loss": 0.3798726797103882, + "step": 20300 + }, + { + "ce_loss": 0.053281012922525406, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "distill_loss": 0.23629030585289001, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "ref_ce_loss": 0.06483253091573715, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "loss": 0.4262133240699768, + "step": 20300 + }, + { + "ce_loss": 0.0720096156001091, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "distill_loss": 0.11169244349002838, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "ref_ce_loss": 0.0754813551902771, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "loss": 0.21413297951221466, + "step": 20300 + }, + { + "ce_loss": 0.0175796989351511, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "distill_loss": 0.09827864170074463, + "epoch": 6.771180787191461, + "step": 20300 + }, + { + "epoch": 6.771180787191461, + "ref_ce_loss": 0.06181326135993004, + "step": 20300 + }, + { + "epoch": 6.774516344229486, + "loss": 0.33, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "grad_norm": 4.409674644470215, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "learning_rate": 1.808604176233933e-05, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "loss": 0.20254576206207275, + "step": 20310 + }, + { + "ce_loss": 0.025957494974136353, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "distill_loss": 0.12091059237718582, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "ref_ce_loss": 0.044557299464941025, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "loss": 0.3443145155906677, + "step": 20310 + }, + { + "ce_loss": 0.015703234821558, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "distill_loss": 0.1548217087984085, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "ref_ce_loss": 0.06314200907945633, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "loss": 0.2929725646972656, + "step": 20310 + }, + { + "ce_loss": 0.039958517998456955, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "distill_loss": 0.14566273987293243, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "ref_ce_loss": 0.04940671846270561, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "loss": 0.24490177631378174, + "step": 20310 + }, + { + "ce_loss": 0.03264036402106285, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "distill_loss": 0.1299162060022354, + "epoch": 6.774516344229486, + "step": 20310 + }, + { + "epoch": 6.774516344229486, + "ref_ce_loss": 0.06482992321252823, + "step": 20310 + }, + { + "epoch": 6.777851901267511, + "loss": 0.3209, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "grad_norm": 3.675053358078003, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "learning_rate": 1.798973576333943e-05, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "loss": 0.15585127472877502, + "step": 20320 + }, + { + "ce_loss": 0.0033037394750863314, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "distill_loss": 0.11441172659397125, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "ref_ce_loss": 0.037796296179294586, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "loss": 0.24352480471134186, + "step": 20320 + }, + { + "ce_loss": 0.01745920069515705, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "distill_loss": 0.10395500808954239, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "ref_ce_loss": 0.0610855370759964, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "loss": 0.2259272187948227, + "step": 20320 + }, + { + "ce_loss": 0.04140777140855789, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "distill_loss": 0.11032719910144806, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "ref_ce_loss": 0.046511076390743256, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "loss": 0.2978399991989136, + "step": 20320 + }, + { + "ce_loss": 0.04130599647760391, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "distill_loss": 0.18572106957435608, + "epoch": 6.777851901267511, + "step": 20320 + }, + { + "epoch": 6.777851901267511, + "ref_ce_loss": 0.07065480202436447, + "step": 20320 + }, + { + "epoch": 6.781187458305537, + "loss": 0.3517, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "grad_norm": 2.8553764820098877, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "learning_rate": 1.789367049898771e-05, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "loss": 0.3826456665992737, + "step": 20330 + }, + { + "ce_loss": 0.04621146619319916, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "distill_loss": 0.1796746551990509, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "ref_ce_loss": 0.04336797073483467, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "loss": 0.3907976746559143, + "step": 20330 + }, + { + "ce_loss": 0.0726345106959343, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "distill_loss": 0.1604585349559784, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "ref_ce_loss": 0.09365147352218628, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "loss": 0.3649190068244934, + "step": 20330 + }, + { + "ce_loss": 0.06374073773622513, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "distill_loss": 0.14042016863822937, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "ref_ce_loss": 0.05827249959111214, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "loss": 0.49203094840049744, + "step": 20330 + }, + { + "ce_loss": 0.12137826532125473, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "distill_loss": 0.22073110938072205, + "epoch": 6.781187458305537, + "step": 20330 + }, + { + "epoch": 6.781187458305537, + "ref_ce_loss": 0.12056776881217957, + "step": 20330 + }, + { + "epoch": 6.784523015343562, + "loss": 0.3236, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "grad_norm": 2.1215176582336426, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "learning_rate": 1.7797846144469306e-05, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "loss": 0.41949424147605896, + "step": 20340 + }, + { + "ce_loss": 0.09814513474702835, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "distill_loss": 0.20361030101776123, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "ref_ce_loss": 0.09809906035661697, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "loss": 0.19860303401947021, + "step": 20340 + }, + { + "ce_loss": 0.02330976165831089, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "distill_loss": 0.12827952206134796, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "ref_ce_loss": 0.03326123580336571, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "loss": 0.4274199903011322, + "step": 20340 + }, + { + "ce_loss": 0.030533771961927414, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "distill_loss": 0.1351586878299713, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "ref_ce_loss": 0.04878941550850868, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "loss": 0.2686753273010254, + "step": 20340 + }, + { + "ce_loss": 0.020406026393175125, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "distill_loss": 0.17345456779003143, + "epoch": 6.784523015343562, + "step": 20340 + }, + { + "epoch": 6.784523015343562, + "ref_ce_loss": 0.049186788499355316, + "step": 20340 + }, + { + "epoch": 6.787858572381587, + "loss": 0.3163, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "grad_norm": 3.0742602348327637, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "learning_rate": 1.770226287453007e-05, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "loss": 0.25368139147758484, + "step": 20350 + }, + { + "ce_loss": 0.02152234874665737, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "distill_loss": 0.16655249893665314, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "ref_ce_loss": 0.0653744712471962, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "loss": 1.1269481182098389, + "step": 20350 + }, + { + "ce_loss": 0.04487686976790428, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "distill_loss": 0.13662518560886383, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "ref_ce_loss": 0.05432640761137009, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "loss": 0.4653398394584656, + "step": 20350 + }, + { + "ce_loss": 0.04627220332622528, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "distill_loss": 0.17873713374137878, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "ref_ce_loss": 0.044496770948171616, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "loss": 0.4285845160484314, + "step": 20350 + }, + { + "ce_loss": 0.033552300184965134, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "distill_loss": 0.26186275482177734, + "epoch": 6.787858572381587, + "step": 20350 + }, + { + "epoch": 6.787858572381587, + "ref_ce_loss": 0.052286114543676376, + "step": 20350 + }, + { + "epoch": 6.791194129419613, + "loss": 0.3572, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "grad_norm": 2.3414089679718018, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "learning_rate": 1.760692086347612e-05, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "loss": 0.2956318259239197, + "step": 20360 + }, + { + "ce_loss": 0.013998754322528839, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "distill_loss": 0.17851464450359344, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "ref_ce_loss": 0.07696002721786499, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "loss": 0.23942914605140686, + "step": 20360 + }, + { + "ce_loss": 0.026676956564188004, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "distill_loss": 0.11965633928775787, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "ref_ce_loss": 0.035076532512903214, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "loss": 0.22385582327842712, + "step": 20360 + }, + { + "ce_loss": 0.03636830672621727, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "distill_loss": 0.1141805499792099, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "ref_ce_loss": 0.046886786818504333, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "loss": 0.36579465866088867, + "step": 20360 + }, + { + "ce_loss": 0.08306985348463058, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "distill_loss": 0.20233827829360962, + "epoch": 6.791194129419613, + "step": 20360 + }, + { + "epoch": 6.791194129419613, + "ref_ce_loss": 0.08025453239679337, + "step": 20360 + }, + { + "epoch": 6.794529686457638, + "loss": 0.3441, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "grad_norm": 2.2329018115997314, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "learning_rate": 1.751182028517373e-05, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "loss": 0.2719927430152893, + "step": 20370 + }, + { + "ce_loss": 0.061337120831012726, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "distill_loss": 0.13906854391098022, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "ref_ce_loss": 0.07132521271705627, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "loss": 0.31627902388572693, + "step": 20370 + }, + { + "ce_loss": 0.02854839526116848, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "distill_loss": 0.18262380361557007, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "ref_ce_loss": 0.07285098731517792, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "loss": 0.4150804579257965, + "step": 20370 + }, + { + "ce_loss": 0.06831822544336319, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "distill_loss": 0.2284497320652008, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "ref_ce_loss": 0.09520136564970016, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "loss": 0.43720874190330505, + "step": 20370 + }, + { + "ce_loss": 0.061582840979099274, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "distill_loss": 0.13514171540737152, + "epoch": 6.794529686457638, + "step": 20370 + }, + { + "epoch": 6.794529686457638, + "ref_ce_loss": 0.027682026848196983, + "step": 20370 + }, + { + "epoch": 6.7978652434956635, + "loss": 0.3417, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "grad_norm": 3.450296640396118, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "learning_rate": 1.7416961313048767e-05, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "loss": 0.3290284276008606, + "step": 20380 + }, + { + "ce_loss": 0.03511648252606392, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "distill_loss": 0.12331343442201614, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "ref_ce_loss": 0.062362030148506165, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "loss": 0.3422921895980835, + "step": 20380 + }, + { + "ce_loss": 0.05525599792599678, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "distill_loss": 0.10891838371753693, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "ref_ce_loss": 0.04750807583332062, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "loss": 0.33280348777770996, + "step": 20380 + }, + { + "ce_loss": 0.05837497115135193, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "distill_loss": 0.16548392176628113, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "ref_ce_loss": 0.06457653641700745, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "loss": 1.3425191640853882, + "step": 20380 + }, + { + "ce_loss": 0.01805054396390915, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "distill_loss": 0.13048408925533295, + "epoch": 6.7978652434956635, + "step": 20380 + }, + { + "epoch": 6.7978652434956635, + "ref_ce_loss": 0.0793338418006897, + "step": 20380 + }, + { + "epoch": 6.801200800533689, + "loss": 0.3615, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "grad_norm": 2.20592999458313, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "learning_rate": 1.7322344120086662e-05, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "loss": 0.33714401721954346, + "step": 20390 + }, + { + "ce_loss": 0.04753788188099861, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "distill_loss": 0.12098333984613419, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "ref_ce_loss": 0.06287845969200134, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "loss": 0.30119767785072327, + "step": 20390 + }, + { + "ce_loss": 0.01172428298741579, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "distill_loss": 0.11199269443750381, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "ref_ce_loss": 0.042432911694049835, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "loss": 0.4813525676727295, + "step": 20390 + }, + { + "ce_loss": 0.029576046392321587, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "distill_loss": 0.1122434064745903, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "ref_ce_loss": 0.05790295824408531, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "loss": 0.2784154415130615, + "step": 20390 + }, + { + "ce_loss": 0.07036291062831879, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "distill_loss": 0.15373992919921875, + "epoch": 6.801200800533689, + "step": 20390 + }, + { + "epoch": 6.801200800533689, + "ref_ce_loss": 0.044378459453582764, + "step": 20390 + }, + { + "epoch": 6.804536357571714, + "loss": 0.3143, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "grad_norm": 2.1802616119384766, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "learning_rate": 1.722796887883183e-05, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "loss": 0.2862167954444885, + "step": 20400 + }, + { + "ce_loss": 0.0310842152684927, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "distill_loss": 0.15729649364948273, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "ref_ce_loss": 0.053252577781677246, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "loss": 0.29880207777023315, + "step": 20400 + }, + { + "ce_loss": 0.06482298672199249, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "distill_loss": 0.1452362835407257, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "ref_ce_loss": 0.08833353221416473, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "loss": 0.7638775110244751, + "step": 20400 + }, + { + "ce_loss": 0.012667951174080372, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "distill_loss": 0.17049390077590942, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "ref_ce_loss": 0.09940537065267563, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "loss": 0.37731772661209106, + "step": 20400 + }, + { + "ce_loss": 0.029722534120082855, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "distill_loss": 0.24861417710781097, + "epoch": 6.804536357571714, + "step": 20400 + }, + { + "epoch": 6.804536357571714, + "ref_ce_loss": 0.07538484036922455, + "step": 20400 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.346, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "grad_norm": 2.557342767715454, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "learning_rate": 1.713383576138746e-05, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.2678622603416443, + "step": 20410 + }, + { + "ce_loss": 0.05426609516143799, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "distill_loss": 0.1291498839855194, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "ref_ce_loss": 0.07145240902900696, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.30670714378356934, + "step": 20410 + }, + { + "ce_loss": 0.03392226621508598, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "distill_loss": 0.18343859910964966, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "ref_ce_loss": 0.056115660816431046, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.6002055406570435, + "step": 20410 + }, + { + "ce_loss": 0.013181916438043118, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "distill_loss": 0.25213515758514404, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "ref_ce_loss": 0.0515945628285408, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "loss": 0.29317811131477356, + "step": 20410 + }, + { + "ce_loss": 0.06708183139562607, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "distill_loss": 0.15257295966148376, + "epoch": 6.8078719146097395, + "step": 20410 + }, + { + "epoch": 6.8078719146097395, + "ref_ce_loss": 0.06174732744693756, + "step": 20410 + }, + { + "epoch": 6.811207471647765, + "loss": 0.3556, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "grad_norm": 3.1058154106140137, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "learning_rate": 1.703994493941523e-05, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "loss": 0.3794916868209839, + "step": 20420 + }, + { + "ce_loss": 0.038069769740104675, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "distill_loss": 0.1606229692697525, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "ref_ce_loss": 0.059208884835243225, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "loss": 0.31073516607284546, + "step": 20420 + }, + { + "ce_loss": 0.012998398393392563, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "distill_loss": 0.14370596408843994, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "ref_ce_loss": 0.059843819588422775, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "loss": 0.2577630281448364, + "step": 20420 + }, + { + "ce_loss": 0.04776522144675255, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "distill_loss": 0.099476657807827, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "ref_ce_loss": 0.060139674693346024, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "loss": 0.3549755811691284, + "step": 20420 + }, + { + "ce_loss": 0.04084332287311554, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "distill_loss": 0.11227469891309738, + "epoch": 6.811207471647765, + "step": 20420 + }, + { + "epoch": 6.811207471647765, + "ref_ce_loss": 0.06732767820358276, + "step": 20420 + }, + { + "epoch": 6.81454302868579, + "loss": 0.3226, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "grad_norm": 3.2220349311828613, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "learning_rate": 1.6946296584134988e-05, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "loss": 0.3655773103237152, + "step": 20430 + }, + { + "ce_loss": 0.01781701296567917, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "distill_loss": 0.27449363470077515, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "ref_ce_loss": 0.05400577932596207, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "loss": 0.283231258392334, + "step": 20430 + }, + { + "ce_loss": 0.024366069585084915, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "distill_loss": 0.17268988490104675, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "ref_ce_loss": 0.043967101722955704, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "loss": 0.31054821610450745, + "step": 20430 + }, + { + "ce_loss": 0.08263547718524933, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "distill_loss": 0.14180082082748413, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "ref_ce_loss": 0.07086239755153656, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "loss": 0.21523864567279816, + "step": 20430 + }, + { + "ce_loss": 0.017624469473958015, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "distill_loss": 0.14015556871891022, + "epoch": 6.81454302868579, + "step": 20430 + }, + { + "epoch": 6.81454302868579, + "ref_ce_loss": 0.04783370718359947, + "step": 20430 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.3806, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "grad_norm": 1.9579981565475464, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "learning_rate": 1.685289086632433e-05, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.3771095275878906, + "step": 20440 + }, + { + "ce_loss": 0.07861079275608063, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "distill_loss": 0.21288646757602692, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "ref_ce_loss": 0.05587618425488472, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.2200825810432434, + "step": 20440 + }, + { + "ce_loss": 0.04254712909460068, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "distill_loss": 0.11262678354978561, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "ref_ce_loss": 0.04462994635105133, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.29384127259254456, + "step": 20440 + }, + { + "ce_loss": 0.03630302846431732, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "distill_loss": 0.11645830422639847, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "ref_ce_loss": 0.037373896688222885, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "loss": 0.39884600043296814, + "step": 20440 + }, + { + "ce_loss": 0.032204754650592804, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "distill_loss": 0.19181369245052338, + "epoch": 6.8178785857238156, + "step": 20440 + }, + { + "epoch": 6.8178785857238156, + "ref_ce_loss": 0.07446800172328949, + "step": 20440 + }, + { + "epoch": 6.821214142761841, + "loss": 0.3907, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "grad_norm": 3.3933627605438232, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "learning_rate": 1.6759727956318536e-05, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "loss": 0.16477397084236145, + "step": 20450 + }, + { + "ce_loss": 0.02636115998029709, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "distill_loss": 0.0941682681441307, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "ref_ce_loss": 0.03172001242637634, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "loss": 0.49552544951438904, + "step": 20450 + }, + { + "ce_loss": 0.06779889762401581, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "distill_loss": 0.1919066607952118, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "ref_ce_loss": 0.05548325926065445, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "loss": 0.293425977230072, + "step": 20450 + }, + { + "ce_loss": 0.039039529860019684, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "distill_loss": 0.18855513632297516, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "ref_ce_loss": 0.04807998239994049, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "loss": 0.359455406665802, + "step": 20450 + }, + { + "ce_loss": 0.05050094425678253, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "distill_loss": 0.15178292989730835, + "epoch": 6.821214142761841, + "step": 20450 + }, + { + "epoch": 6.821214142761841, + "ref_ce_loss": 0.039823852479457855, + "step": 20450 + }, + { + "epoch": 6.824549699799866, + "loss": 0.3371, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "grad_norm": 3.3244168758392334, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "learning_rate": 1.666680802400992e-05, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "loss": 0.3092981278896332, + "step": 20460 + }, + { + "ce_loss": 0.045066677033901215, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "distill_loss": 0.21357296407222748, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "ref_ce_loss": 0.03532247617840767, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "loss": 0.33261650800704956, + "step": 20460 + }, + { + "ce_loss": 0.04242473095655441, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "distill_loss": 0.16028018295764923, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "ref_ce_loss": 0.06327745318412781, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "loss": 0.38244354724884033, + "step": 20460 + }, + { + "ce_loss": 0.06900951266288757, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "distill_loss": 0.16052497923374176, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "ref_ce_loss": 0.06689080595970154, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "loss": 0.277946412563324, + "step": 20460 + }, + { + "ce_loss": 0.024767400696873665, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "distill_loss": 0.1381060630083084, + "epoch": 6.824549699799866, + "step": 20460 + }, + { + "epoch": 6.824549699799866, + "ref_ce_loss": 0.05307367444038391, + "step": 20460 + }, + { + "epoch": 6.827885256837892, + "loss": 0.3433, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "grad_norm": 4.6717658042907715, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "learning_rate": 1.657413123884782e-05, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "loss": 0.3429701626300812, + "step": 20470 + }, + { + "ce_loss": 0.05055519938468933, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "distill_loss": 0.20971356332302094, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "ref_ce_loss": 0.07031997293233871, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "loss": 1.1225718259811401, + "step": 20470 + }, + { + "ce_loss": 0.0169549398124218, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "distill_loss": 0.213147833943367, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "ref_ce_loss": 0.09352371096611023, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "loss": 1.0933277606964111, + "step": 20470 + }, + { + "ce_loss": 0.059152502566576004, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "distill_loss": 0.19744914770126343, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "ref_ce_loss": 0.06428895890712738, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "loss": 0.29990047216415405, + "step": 20470 + }, + { + "ce_loss": 0.030425040051341057, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "distill_loss": 0.1377599537372589, + "epoch": 6.827885256837892, + "step": 20470 + }, + { + "epoch": 6.827885256837892, + "ref_ce_loss": 0.057538341730833054, + "step": 20470 + }, + { + "epoch": 6.831220813875917, + "loss": 0.3842, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "grad_norm": 2.320603132247925, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "learning_rate": 1.6481697769838166e-05, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "loss": 0.13774797320365906, + "step": 20480 + }, + { + "ce_loss": 0.0031953605357557535, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "distill_loss": 0.09736062586307526, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "ref_ce_loss": 0.019617626443505287, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "loss": 0.14723575115203857, + "step": 20480 + }, + { + "ce_loss": 0.025522742420434952, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "distill_loss": 0.08433471620082855, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "ref_ce_loss": 0.03724829852581024, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "loss": 0.38259050250053406, + "step": 20480 + }, + { + "ce_loss": 0.0821247398853302, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "distill_loss": 0.19539779424667358, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "ref_ce_loss": 0.04475909844040871, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "loss": 0.3221023976802826, + "step": 20480 + }, + { + "ce_loss": 0.05226357653737068, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "distill_loss": 0.12409257888793945, + "epoch": 6.831220813875917, + "step": 20480 + }, + { + "epoch": 6.831220813875917, + "ref_ce_loss": 0.10269239544868469, + "step": 20480 + }, + { + "epoch": 6.834556370913942, + "loss": 0.3303, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "grad_norm": 2.2937734127044678, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "learning_rate": 1.6389507785543067e-05, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "loss": 0.3213595747947693, + "step": 20490 + }, + { + "ce_loss": 0.0633217841386795, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "distill_loss": 0.1517091989517212, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "ref_ce_loss": 0.060797836631536484, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "loss": 0.45498204231262207, + "step": 20490 + }, + { + "ce_loss": 0.09181714057922363, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "distill_loss": 0.2289491891860962, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "ref_ce_loss": 0.10269571840763092, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "loss": 0.4061187207698822, + "step": 20490 + }, + { + "ce_loss": 0.0254044346511364, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "distill_loss": 0.14261065423488617, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "ref_ce_loss": 0.06448842585086823, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "loss": 0.27845606207847595, + "step": 20490 + }, + { + "ce_loss": 0.08355095237493515, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "distill_loss": 0.11640045046806335, + "epoch": 6.834556370913942, + "step": 20490 + }, + { + "epoch": 6.834556370913942, + "ref_ce_loss": 0.0555226132273674, + "step": 20490 + }, + { + "epoch": 6.837891927951968, + "loss": 0.3147, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "grad_norm": 2.9679994583129883, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "learning_rate": 1.6297561454080727e-05, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "loss": 0.24702830612659454, + "step": 20500 + }, + { + "ce_loss": 0.05579338222742081, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "distill_loss": 0.12925627827644348, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "ref_ce_loss": 0.06171823665499687, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "loss": 0.30632027983665466, + "step": 20500 + }, + { + "ce_loss": 0.034926217049360275, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "distill_loss": 0.16829253733158112, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "ref_ce_loss": 0.0464189313352108, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "loss": 0.17535685002803802, + "step": 20500 + }, + { + "ce_loss": 0.0038707999046891928, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "distill_loss": 0.08704368770122528, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "ref_ce_loss": 0.04450593888759613, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "loss": 0.8304358720779419, + "step": 20500 + }, + { + "ce_loss": 0.09895811975002289, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "distill_loss": 0.14359821379184723, + "epoch": 6.837891927951968, + "step": 20500 + }, + { + "epoch": 6.837891927951968, + "ref_ce_loss": 0.05015580728650093, + "step": 20500 + }, + { + "epoch": 6.841227484989993, + "loss": 0.317, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "grad_norm": 1.9534635543823242, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "learning_rate": 1.6205858943125005e-05, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "loss": 0.39088451862335205, + "step": 20510 + }, + { + "ce_loss": 0.03835386037826538, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "distill_loss": 0.20981791615486145, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "ref_ce_loss": 0.06102800369262695, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "loss": 0.5545916557312012, + "step": 20510 + }, + { + "ce_loss": 0.07103858143091202, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "distill_loss": 0.17186136543750763, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "ref_ce_loss": 0.06757304817438126, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "loss": 0.1919550895690918, + "step": 20510 + }, + { + "ce_loss": 0.0106651084497571, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "distill_loss": 0.13799473643302917, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "ref_ce_loss": 0.02933264710009098, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "loss": 0.4210149049758911, + "step": 20510 + }, + { + "ce_loss": 0.015158873051404953, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "distill_loss": 0.18024934828281403, + "epoch": 6.841227484989993, + "step": 20510 + }, + { + "epoch": 6.841227484989993, + "ref_ce_loss": 0.06706501543521881, + "step": 20510 + }, + { + "epoch": 6.844563042028018, + "loss": 0.3569, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "grad_norm": 4.50359582901001, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "learning_rate": 1.6114400419905067e-05, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "loss": 0.35514190793037415, + "step": 20520 + }, + { + "ce_loss": 0.08249351382255554, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "distill_loss": 0.13066443800926208, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "ref_ce_loss": 0.041989509016275406, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "loss": 0.26543962955474854, + "step": 20520 + }, + { + "ce_loss": 0.0641234815120697, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "distill_loss": 0.14749526977539062, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "ref_ce_loss": 0.03656027466058731, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "loss": 0.29692205786705017, + "step": 20520 + }, + { + "ce_loss": 0.01707381382584572, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "distill_loss": 0.1400364339351654, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "ref_ce_loss": 0.06183823198080063, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "loss": 0.24452118575572968, + "step": 20520 + }, + { + "ce_loss": 0.027332574129104614, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "distill_loss": 0.14121589064598083, + "epoch": 6.844563042028018, + "step": 20520 + }, + { + "epoch": 6.844563042028018, + "ref_ce_loss": 0.05127198249101639, + "step": 20520 + }, + { + "epoch": 6.847898599066044, + "loss": 0.3207, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "grad_norm": 3.143054485321045, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "learning_rate": 1.6023186051205243e-05, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "loss": 0.29538893699645996, + "step": 20530 + }, + { + "ce_loss": 0.026194782927632332, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "distill_loss": 0.15457136929035187, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "ref_ce_loss": 0.046603668481111526, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "loss": 0.29159462451934814, + "step": 20530 + }, + { + "ce_loss": 0.01935706101357937, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "distill_loss": 0.1832038313150406, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "ref_ce_loss": 0.04731707647442818, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "loss": 0.4152846038341522, + "step": 20530 + }, + { + "ce_loss": 0.02133793942630291, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "distill_loss": 0.1754729300737381, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "ref_ce_loss": 0.0643564984202385, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "loss": 0.35213616490364075, + "step": 20530 + }, + { + "ce_loss": 0.028784506022930145, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "distill_loss": 0.17917081713676453, + "epoch": 6.847898599066044, + "step": 20530 + }, + { + "epoch": 6.847898599066044, + "ref_ce_loss": 0.06949307024478912, + "step": 20530 + }, + { + "epoch": 6.851234156104069, + "loss": 0.383, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "grad_norm": 5.041261196136475, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "learning_rate": 1.59322160033645e-05, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "loss": 0.3897348940372467, + "step": 20540 + }, + { + "ce_loss": 0.045651961117982864, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "distill_loss": 0.2878296971321106, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "ref_ce_loss": 0.05607311427593231, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "loss": 0.36378687620162964, + "step": 20540 + }, + { + "ce_loss": 0.056211087852716446, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "distill_loss": 0.2356104701757431, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "ref_ce_loss": 0.05061378329992294, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "loss": 0.3529844284057617, + "step": 20540 + }, + { + "ce_loss": 0.02915358357131481, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "distill_loss": 0.15124404430389404, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "ref_ce_loss": 0.08586305379867554, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "loss": 0.3024942874908447, + "step": 20540 + }, + { + "ce_loss": 0.05238991603255272, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "distill_loss": 0.14373472332954407, + "epoch": 6.851234156104069, + "step": 20540 + }, + { + "epoch": 6.851234156104069, + "ref_ce_loss": 0.057221703231334686, + "step": 20540 + }, + { + "epoch": 6.854569713142094, + "loss": 0.3917, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "grad_norm": 7.488528728485107, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "learning_rate": 1.5841490442276332e-05, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "loss": 0.41819265484809875, + "step": 20550 + }, + { + "ce_loss": 0.018714766949415207, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "distill_loss": 0.3183062672615051, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "ref_ce_loss": 0.03689192607998848, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "loss": 0.36515775322914124, + "step": 20550 + }, + { + "ce_loss": 0.08507240563631058, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "distill_loss": 0.14640793204307556, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "ref_ce_loss": 0.05818329378962517, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "loss": 0.7347936034202576, + "step": 20550 + }, + { + "ce_loss": 0.09090854972600937, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "distill_loss": 0.41110169887542725, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "ref_ce_loss": 0.0624711699783802, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "loss": 0.33083295822143555, + "step": 20550 + }, + { + "ce_loss": 0.034476906061172485, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "distill_loss": 0.14133962988853455, + "epoch": 6.854569713142094, + "step": 20550 + }, + { + "epoch": 6.854569713142094, + "ref_ce_loss": 0.05410853400826454, + "step": 20550 + }, + { + "epoch": 6.85790527018012, + "loss": 0.4608, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "grad_norm": 8.407671928405762, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "learning_rate": 1.575100953338838e-05, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "loss": 0.39217668771743774, + "step": 20560 + }, + { + "ce_loss": 0.049609459936618805, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "distill_loss": 0.2388307750225067, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "ref_ce_loss": 0.055172622203826904, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "loss": 0.503061056137085, + "step": 20560 + }, + { + "ce_loss": 0.029303135350346565, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "distill_loss": 0.3383800983428955, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "ref_ce_loss": 0.043822310864925385, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "loss": 0.37875670194625854, + "step": 20560 + }, + { + "ce_loss": 0.018401410430669785, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "distill_loss": 0.28223758935928345, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "ref_ce_loss": 0.06798820197582245, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "loss": 0.45275554060935974, + "step": 20560 + }, + { + "ce_loss": 0.05281388759613037, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "distill_loss": 0.3191457688808441, + "epoch": 6.85790527018012, + "step": 20560 + }, + { + "epoch": 6.85790527018012, + "ref_ce_loss": 0.05630598962306976, + "step": 20560 + }, + { + "epoch": 6.861240827218145, + "loss": 0.5633, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "grad_norm": 7.339962482452393, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "learning_rate": 1.566077344170214e-05, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "loss": 0.2997542917728424, + "step": 20570 + }, + { + "ce_loss": 0.01282829511910677, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "distill_loss": 0.21651379764080048, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "ref_ce_loss": 0.0550638884305954, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "loss": 0.7980534434318542, + "step": 20570 + }, + { + "ce_loss": 0.06555254012346268, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "distill_loss": 0.702267050743103, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "ref_ce_loss": 0.030158137902617455, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "loss": 1.2472227811813354, + "step": 20570 + }, + { + "ce_loss": 0.11841049045324326, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "distill_loss": 0.9726265072822571, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "ref_ce_loss": 0.08595526963472366, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "loss": 0.608909010887146, + "step": 20570 + }, + { + "ce_loss": 0.08853866904973984, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "distill_loss": 0.4263076186180115, + "epoch": 6.861240827218145, + "step": 20570 + }, + { + "epoch": 6.861240827218145, + "ref_ce_loss": 0.05566973239183426, + "step": 20570 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.633, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "grad_norm": 9.359967231750488, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "learning_rate": 1.557078233177268e-05, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.37335649132728577, + "step": 20580 + }, + { + "ce_loss": 0.04577264189720154, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "distill_loss": 0.2800779938697815, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "ref_ce_loss": 0.04734800010919571, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.6135170459747314, + "step": 20580 + }, + { + "ce_loss": 0.0938236191868782, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "distill_loss": 0.4432787001132965, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "ref_ce_loss": 0.06203773617744446, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.5161895751953125, + "step": 20580 + }, + { + "ce_loss": 0.0030224386136978865, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "distill_loss": 0.33014774322509766, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "ref_ce_loss": 0.04769544303417206, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "loss": 0.5059229135513306, + "step": 20580 + }, + { + "ce_loss": 0.009971593506634235, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "distill_loss": 0.40563908219337463, + "epoch": 6.8645763842561704, + "step": 20580 + }, + { + "epoch": 6.8645763842561704, + "ref_ce_loss": 0.051717959344387054, + "step": 20580 + }, + { + "epoch": 6.867911941294196, + "loss": 0.4427, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "grad_norm": 6.026227951049805, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "learning_rate": 1.5481036367708212e-05, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "loss": 0.72898268699646, + "step": 20590 + }, + { + "ce_loss": 0.056139182299375534, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "distill_loss": 0.20178236067295074, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "ref_ce_loss": 0.06525614857673645, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "loss": 0.3133620023727417, + "step": 20590 + }, + { + "ce_loss": 0.03392108902335167, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "distill_loss": 0.15026448667049408, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "ref_ce_loss": 0.06486678868532181, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "loss": 0.31791892647743225, + "step": 20590 + }, + { + "ce_loss": 0.046934567391872406, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "distill_loss": 0.19896210730075836, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "ref_ce_loss": 0.05313270166516304, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "loss": 0.40088412165641785, + "step": 20590 + }, + { + "ce_loss": 0.04394695535302162, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "distill_loss": 0.24145759642124176, + "epoch": 6.867911941294196, + "step": 20590 + }, + { + "epoch": 6.867911941294196, + "ref_ce_loss": 0.04558607190847397, + "step": 20590 + }, + { + "epoch": 6.871247498332221, + "loss": 0.3944, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "grad_norm": 2.8247768878936768, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "learning_rate": 1.539153571317e-05, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "loss": 0.5353904962539673, + "step": 20600 + }, + { + "ce_loss": 0.08445299416780472, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "distill_loss": 0.3093477785587311, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "ref_ce_loss": 0.08602926880121231, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "loss": 0.29699471592903137, + "step": 20600 + }, + { + "ce_loss": 0.029342997819185257, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "distill_loss": 0.22840334475040436, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "ref_ce_loss": 0.0391845703125, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "loss": 0.20265303552150726, + "step": 20600 + }, + { + "ce_loss": 0.005748094525188208, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "distill_loss": 0.14643459022045135, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "ref_ce_loss": 0.05027634650468826, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "loss": 0.46097424626350403, + "step": 20600 + }, + { + "ce_loss": 0.04106662794947624, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "distill_loss": 0.2435900866985321, + "epoch": 6.871247498332221, + "step": 20600 + }, + { + "epoch": 6.871247498332221, + "ref_ce_loss": 0.03964756056666374, + "step": 20600 + }, + { + "epoch": 6.8745830553702465, + "loss": 0.5758, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "grad_norm": 15.702533721923828, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "learning_rate": 1.5302280531371957e-05, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "loss": 0.4692019820213318, + "step": 20610 + }, + { + "ce_loss": 0.039602942764759064, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "distill_loss": 0.373125821352005, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "ref_ce_loss": 0.046325571835041046, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "loss": 0.7239594459533691, + "step": 20610 + }, + { + "ce_loss": 0.018161263316869736, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "distill_loss": 0.6537948846817017, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "ref_ce_loss": 0.03868037089705467, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "loss": 0.6063088178634644, + "step": 20610 + }, + { + "ce_loss": 0.018849315121769905, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "distill_loss": 0.4054892659187317, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "ref_ce_loss": 0.05163077265024185, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "loss": 1.016980767250061, + "step": 20610 + }, + { + "ce_loss": 0.017590520903468132, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "distill_loss": 0.8820805549621582, + "epoch": 6.8745830553702465, + "step": 20610 + }, + { + "epoch": 6.8745830553702465, + "ref_ce_loss": 0.05267731845378876, + "step": 20610 + }, + { + "epoch": 6.877918612408272, + "loss": 0.7963, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "grad_norm": 11.103915214538574, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "learning_rate": 1.52132709850803e-05, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "loss": 1.7030434608459473, + "step": 20620 + }, + { + "ce_loss": 0.1394777148962021, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "distill_loss": 1.216463565826416, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "ref_ce_loss": 0.08501607924699783, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "loss": 0.8534746170043945, + "step": 20620 + }, + { + "ce_loss": 0.029901063069701195, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "distill_loss": 0.7174789309501648, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "ref_ce_loss": 0.05184415727853775, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "loss": 0.5241031646728516, + "step": 20620 + }, + { + "ce_loss": 0.029480108991265297, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "distill_loss": 0.342683345079422, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "ref_ce_loss": 0.0461571030318737, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "loss": 0.7048920392990112, + "step": 20620 + }, + { + "ce_loss": 0.04662850871682167, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "distill_loss": 0.5464339852333069, + "epoch": 6.877918612408272, + "step": 20620 + }, + { + "epoch": 6.877918612408272, + "ref_ce_loss": 0.07147201150655746, + "step": 20620 + }, + { + "epoch": 6.881254169446297, + "loss": 0.8145, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "grad_norm": 11.292054176330566, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "learning_rate": 1.512450723661337e-05, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "loss": 0.42995086312294006, + "step": 20630 + }, + { + "ce_loss": 0.011972256004810333, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "distill_loss": 0.36720559000968933, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "ref_ce_loss": 0.03696437552571297, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "loss": 0.5721352696418762, + "step": 20630 + }, + { + "ce_loss": 0.07270863652229309, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "distill_loss": 0.39840593934059143, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "ref_ce_loss": 0.07148131728172302, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "loss": 0.4161236882209778, + "step": 20630 + }, + { + "ce_loss": 0.025703487917780876, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "distill_loss": 0.3056744337081909, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "ref_ce_loss": 0.04110679775476456, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "loss": 0.43479883670806885, + "step": 20630 + }, + { + "ce_loss": 0.05821816995739937, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "distill_loss": 0.29935696721076965, + "epoch": 6.881254169446297, + "step": 20630 + }, + { + "epoch": 6.881254169446297, + "ref_ce_loss": 0.047724220901727676, + "step": 20630 + }, + { + "epoch": 6.8845897264843225, + "loss": 0.552, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "grad_norm": 17.464458465576172, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "learning_rate": 1.5035989447841167e-05, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "loss": 0.4306272566318512, + "step": 20640 + }, + { + "ce_loss": 0.049284566193819046, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "distill_loss": 0.28677845001220703, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "ref_ce_loss": 0.04678497090935707, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "loss": 0.32574254274368286, + "step": 20640 + }, + { + "ce_loss": 0.020122941583395004, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "distill_loss": 0.26398566365242004, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "ref_ce_loss": 0.027873439714312553, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "loss": 0.41913729906082153, + "step": 20640 + }, + { + "ce_loss": 0.035174764692783356, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "distill_loss": 0.3242431879043579, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "ref_ce_loss": 0.0594937726855278, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "loss": 1.0001294612884521, + "step": 20640 + }, + { + "ce_loss": 0.06655082106590271, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "distill_loss": 0.8158390522003174, + "epoch": 6.8845897264843225, + "step": 20640 + }, + { + "epoch": 6.8845897264843225, + "ref_ce_loss": 0.07071325927972794, + "step": 20640 + }, + { + "epoch": 6.887925283522348, + "loss": 0.7093, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "grad_norm": 16.102046966552734, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "learning_rate": 1.494771778018527e-05, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "loss": 0.5199611186981201, + "step": 20650 + }, + { + "ce_loss": 0.015108607709407806, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "distill_loss": 0.42118048667907715, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "ref_ce_loss": 0.08327829837799072, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "loss": 1.312179446220398, + "step": 20650 + }, + { + "ce_loss": 0.009233505465090275, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "distill_loss": 1.1099575757980347, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "ref_ce_loss": 0.0498458668589592, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "loss": 1.1651020050048828, + "step": 20650 + }, + { + "ce_loss": 0.00040370121132582426, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "distill_loss": 1.0097606182098389, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "ref_ce_loss": 0.03661251440644264, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "loss": 0.9241766929626465, + "step": 20650 + }, + { + "ce_loss": 0.018162589520215988, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "distill_loss": 0.8247155547142029, + "epoch": 6.887925283522348, + "step": 20650 + }, + { + "epoch": 6.887925283522348, + "ref_ce_loss": 0.055458761751651764, + "step": 20650 + }, + { + "epoch": 6.891260840560373, + "loss": 0.8902, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "grad_norm": 17.5844669342041, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "learning_rate": 1.4859692394618345e-05, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "loss": 0.3984909951686859, + "step": 20660 + }, + { + "ce_loss": 0.023830199614167213, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "distill_loss": 0.2762604355812073, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "ref_ce_loss": 0.04289623722434044, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "loss": 0.5686413645744324, + "step": 20660 + }, + { + "ce_loss": 0.06819921731948853, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "distill_loss": 0.4240104854106903, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "ref_ce_loss": 0.07634566724300385, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "loss": 0.9278061389923096, + "step": 20660 + }, + { + "ce_loss": 0.016938427463173866, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "distill_loss": 0.8398166298866272, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "ref_ce_loss": 0.070817731320858, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "loss": 0.5596387386322021, + "step": 20660 + }, + { + "ce_loss": 0.05967777222394943, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "distill_loss": 0.38053545355796814, + "epoch": 6.891260840560373, + "step": 20660 + }, + { + "epoch": 6.891260840560373, + "ref_ce_loss": 0.043331343680620193, + "step": 20660 + }, + { + "epoch": 6.894596397598399, + "loss": 0.5821, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "grad_norm": 15.2507963180542, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "learning_rate": 1.4771913451664002e-05, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "loss": 0.5702903866767883, + "step": 20670 + }, + { + "ce_loss": 0.028813892975449562, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "distill_loss": 0.4404188394546509, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "ref_ce_loss": 0.0773758515715599, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "loss": 0.4521728456020355, + "step": 20670 + }, + { + "ce_loss": 0.08183231204748154, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "distill_loss": 0.272499144077301, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "ref_ce_loss": 0.05865732207894325, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "loss": 0.5765916705131531, + "step": 20670 + }, + { + "ce_loss": 0.02798348292708397, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "distill_loss": 0.47397539019584656, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "ref_ce_loss": 0.0540350042283535, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "loss": 0.31577372550964355, + "step": 20670 + }, + { + "ce_loss": 0.05203654244542122, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "distill_loss": 0.1875409334897995, + "epoch": 6.894596397598399, + "step": 20670 + }, + { + "epoch": 6.894596397598399, + "ref_ce_loss": 0.054577991366386414, + "step": 20670 + }, + { + "epoch": 6.897931954636424, + "loss": 0.5992, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "grad_norm": 8.946640968322754, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "learning_rate": 1.4684381111396399e-05, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "loss": 0.40662744641304016, + "step": 20680 + }, + { + "ce_loss": 0.020231124013662338, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "distill_loss": 0.334001749753952, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "ref_ce_loss": 0.035733092576265335, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "loss": 0.4750587046146393, + "step": 20680 + }, + { + "ce_loss": 0.03923625871539116, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "distill_loss": 0.38633885979652405, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "ref_ce_loss": 0.04937080293893814, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "loss": 0.47791561484336853, + "step": 20680 + }, + { + "ce_loss": 0.052632320672273636, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "distill_loss": 0.35709503293037415, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "ref_ce_loss": 0.06743654608726501, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "loss": 0.8078065514564514, + "step": 20680 + }, + { + "ce_loss": 0.02854505181312561, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "distill_loss": 0.5855287313461304, + "epoch": 6.897931954636424, + "step": 20680 + }, + { + "epoch": 6.897931954636424, + "ref_ce_loss": 0.042745549231767654, + "step": 20680 + }, + { + "epoch": 6.901267511674449, + "loss": 0.6226, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "grad_norm": 17.90632438659668, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "learning_rate": 1.4597095533440013e-05, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "loss": 0.3882908225059509, + "step": 20690 + }, + { + "ce_loss": 0.06582961976528168, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "distill_loss": 0.23433640599250793, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "ref_ce_loss": 0.06076068431138992, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "loss": 0.8035255670547485, + "step": 20690 + }, + { + "ce_loss": 0.02489415742456913, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "distill_loss": 0.3288425803184509, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "ref_ce_loss": 0.07385779172182083, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "loss": 0.3329160809516907, + "step": 20690 + }, + { + "ce_loss": 0.03135690838098526, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "distill_loss": 0.2420273721218109, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "ref_ce_loss": 0.03833601996302605, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "loss": 0.4151855409145355, + "step": 20690 + }, + { + "ce_loss": 0.02253338135778904, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "distill_loss": 0.32237106561660767, + "epoch": 6.901267511674449, + "step": 20690 + }, + { + "epoch": 6.901267511674449, + "ref_ce_loss": 0.04350052401423454, + "step": 20690 + }, + { + "epoch": 6.904603068712475, + "loss": 0.531, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "grad_norm": 7.521162986755371, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "learning_rate": 1.4510056876969267e-05, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "loss": 0.8306207060813904, + "step": 20700 + }, + { + "ce_loss": 0.07492867112159729, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "distill_loss": 0.6766117215156555, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "ref_ce_loss": 0.07895965129137039, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "loss": 0.5224670767784119, + "step": 20700 + }, + { + "ce_loss": 0.08923482149839401, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "distill_loss": 0.33293861150741577, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "ref_ce_loss": 0.09829211235046387, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "loss": 0.4990532696247101, + "step": 20700 + }, + { + "ce_loss": 0.023325325921177864, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "distill_loss": 0.37632182240486145, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "ref_ce_loss": 0.06445571035146713, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "loss": 0.4500182271003723, + "step": 20700 + }, + { + "ce_loss": 0.05171177163720131, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "distill_loss": 0.24152307212352753, + "epoch": 6.904603068712475, + "step": 20700 + }, + { + "epoch": 6.904603068712475, + "ref_ce_loss": 0.06856939196586609, + "step": 20700 + }, + { + "epoch": 6.9079386257505, + "loss": 0.5049, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "grad_norm": 8.229083061218262, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "learning_rate": 1.442326530070838e-05, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "loss": 0.30293604731559753, + "step": 20710 + }, + { + "ce_loss": 0.024854473769664764, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "distill_loss": 0.16231058537960052, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "ref_ce_loss": 0.0645240843296051, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "loss": 0.8261449933052063, + "step": 20710 + }, + { + "ce_loss": 0.08644302934408188, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "distill_loss": 0.5393825769424438, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "ref_ce_loss": 0.10075093805789948, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "loss": 0.4293292164802551, + "step": 20710 + }, + { + "ce_loss": 0.033315710723400116, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "distill_loss": 0.2953912317752838, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "ref_ce_loss": 0.04508380591869354, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "loss": 0.3437563478946686, + "step": 20710 + }, + { + "ce_loss": 0.06788795441389084, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "distill_loss": 0.17214861512184143, + "epoch": 6.9079386257505, + "step": 20710 + }, + { + "epoch": 6.9079386257505, + "ref_ce_loss": 0.08690239489078522, + "step": 20710 + }, + { + "epoch": 6.911274182788525, + "loss": 0.4995, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "grad_norm": 8.721784591674805, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "learning_rate": 1.4336720962930898e-05, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "loss": 0.6756659746170044, + "step": 20720 + }, + { + "ce_loss": 0.008523699827492237, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "distill_loss": 0.25348472595214844, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "ref_ce_loss": 0.05715152993798256, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "loss": 0.7252782583236694, + "step": 20720 + }, + { + "ce_loss": 0.03596337139606476, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "distill_loss": 0.41875484585762024, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "ref_ce_loss": 0.08006498217582703, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "loss": 0.4181911051273346, + "step": 20720 + }, + { + "ce_loss": 0.009214845485985279, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "distill_loss": 0.3338942229747772, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "ref_ce_loss": 0.07484129816293716, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "loss": 0.3620077669620514, + "step": 20720 + }, + { + "ce_loss": 0.03972398489713669, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "distill_loss": 0.2782702147960663, + "epoch": 6.911274182788525, + "step": 20720 + }, + { + "epoch": 6.911274182788525, + "ref_ce_loss": 0.043749336153268814, + "step": 20720 + }, + { + "epoch": 6.914609739826551, + "loss": 0.439, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "grad_norm": 5.441989898681641, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "learning_rate": 1.4250424021459555e-05, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "loss": 0.30001750588417053, + "step": 20730 + }, + { + "ce_loss": 0.03059590421617031, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "distill_loss": 0.21454161405563354, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "ref_ce_loss": 0.05449133366346359, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "loss": 0.5982506275177002, + "step": 20730 + }, + { + "ce_loss": 0.03713531047105789, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "distill_loss": 0.4382067620754242, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "ref_ce_loss": 0.09281221032142639, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "loss": 0.4225163459777832, + "step": 20730 + }, + { + "ce_loss": 0.08537886291742325, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "distill_loss": 0.18929584324359894, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "ref_ce_loss": 0.05052082613110542, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "loss": 0.33002710342407227, + "step": 20730 + }, + { + "ce_loss": 0.037739742547273636, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "distill_loss": 0.21521805226802826, + "epoch": 6.914609739826551, + "step": 20730 + }, + { + "epoch": 6.914609739826551, + "ref_ce_loss": 0.07679290324449539, + "step": 20730 + }, + { + "epoch": 6.917945296864576, + "loss": 0.4035, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "grad_norm": 4.653440952301025, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "learning_rate": 1.4164374633666003e-05, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "loss": 0.5228537917137146, + "step": 20740 + }, + { + "ce_loss": 0.03656856343150139, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "distill_loss": 0.4247232675552368, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "ref_ce_loss": 0.06130426377058029, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "loss": 0.3749106526374817, + "step": 20740 + }, + { + "ce_loss": 0.02962685376405716, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "distill_loss": 0.27801570296287537, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "ref_ce_loss": 0.0668526366353035, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "loss": 0.17048296332359314, + "step": 20740 + }, + { + "ce_loss": 0.01074532326310873, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "distill_loss": 0.12260544300079346, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "ref_ce_loss": 0.03705614432692528, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "loss": 0.19853581488132477, + "step": 20740 + }, + { + "ce_loss": 0.005890983156859875, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "distill_loss": 0.1258002668619156, + "epoch": 6.917945296864576, + "step": 20740 + }, + { + "epoch": 6.917945296864576, + "ref_ce_loss": 0.04815744236111641, + "step": 20740 + }, + { + "epoch": 6.921280853902601, + "loss": 0.4412, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "grad_norm": 5.386718273162842, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "learning_rate": 1.4078572956470335e-05, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "loss": 0.5025739669799805, + "step": 20750 + }, + { + "ce_loss": 0.07032178342342377, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "distill_loss": 0.2361125499010086, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "ref_ce_loss": 0.04584193974733353, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "loss": 0.515607476234436, + "step": 20750 + }, + { + "ce_loss": 0.0668293833732605, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "distill_loss": 0.2832540273666382, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "ref_ce_loss": 0.07667690515518188, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "loss": 0.27646130323410034, + "step": 20750 + }, + { + "ce_loss": 0.009194553829729557, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "distill_loss": 0.16538158059120178, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "ref_ce_loss": 0.08040463179349899, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "loss": 0.20386376976966858, + "step": 20750 + }, + { + "ce_loss": 0.013103666715323925, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "distill_loss": 0.14098665118217468, + "epoch": 6.921280853902601, + "step": 20750 + }, + { + "epoch": 6.921280853902601, + "ref_ce_loss": 0.04962094500660896, + "step": 20750 + }, + { + "epoch": 6.924616410940627, + "loss": 0.4026, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "grad_norm": 6.593588352203369, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "learning_rate": 1.3993019146340973e-05, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "loss": 0.40435919165611267, + "step": 20760 + }, + { + "ce_loss": 0.06731601804494858, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "distill_loss": 0.20835278928279877, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "ref_ce_loss": 0.06101388484239578, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "loss": 0.6540699005126953, + "step": 20760 + }, + { + "ce_loss": 0.05264006927609444, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "distill_loss": 0.29627346992492676, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "ref_ce_loss": 0.0648992583155632, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "loss": 0.49436137080192566, + "step": 20760 + }, + { + "ce_loss": 0.0642567053437233, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "distill_loss": 0.18897266685962677, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "ref_ce_loss": 0.04494559019804001, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "loss": 0.2949737012386322, + "step": 20760 + }, + { + "ce_loss": 0.025369055569171906, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "distill_loss": 0.23227348923683167, + "epoch": 6.924616410940627, + "step": 20760 + }, + { + "epoch": 6.924616410940627, + "ref_ce_loss": 0.03722801432013512, + "step": 20760 + }, + { + "epoch": 6.927951967978652, + "loss": 0.4255, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "grad_norm": 3.854159116744995, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "learning_rate": 1.3907713359294298e-05, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "loss": 0.4830353260040283, + "step": 20770 + }, + { + "ce_loss": 0.046318408101797104, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "distill_loss": 0.37137317657470703, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "ref_ce_loss": 0.031674664467573166, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "loss": 0.44186148047447205, + "step": 20770 + }, + { + "ce_loss": 0.08817555755376816, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "distill_loss": 0.2676374316215515, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "ref_ce_loss": 0.06345969438552856, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "loss": 0.44447100162506104, + "step": 20770 + }, + { + "ce_loss": 0.02692546881735325, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "distill_loss": 0.32898080348968506, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "ref_ce_loss": 0.05204636603593826, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "loss": 0.35606223344802856, + "step": 20770 + }, + { + "ce_loss": 0.020126059651374817, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "distill_loss": 0.3049810826778412, + "epoch": 6.927951967978652, + "step": 20770 + }, + { + "epoch": 6.927951967978652, + "ref_ce_loss": 0.030778730288147926, + "step": 20770 + }, + { + "epoch": 6.931287525016677, + "loss": 0.457, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "grad_norm": 4.456582546234131, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "learning_rate": 1.3822655750894424e-05, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "loss": 0.44263705611228943, + "step": 20780 + }, + { + "ce_loss": 0.08875694870948792, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "distill_loss": 0.23786699771881104, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "ref_ce_loss": 0.04129364714026451, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "loss": 0.6732338666915894, + "step": 20780 + }, + { + "ce_loss": 0.05769633874297142, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "distill_loss": 0.277824729681015, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "ref_ce_loss": 0.07031594961881638, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "loss": 0.27141398191452026, + "step": 20780 + }, + { + "ce_loss": 0.013270096853375435, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "distill_loss": 0.15588274598121643, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "ref_ce_loss": 0.05156317353248596, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "loss": 0.45670467615127563, + "step": 20780 + }, + { + "ce_loss": 0.07102420181035995, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "distill_loss": 0.18294796347618103, + "epoch": 6.931287525016677, + "step": 20780 + }, + { + "epoch": 6.931287525016677, + "ref_ce_loss": 0.07500330358743668, + "step": 20780 + }, + { + "epoch": 6.934623082054703, + "loss": 0.4402, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "grad_norm": 7.2800397872924805, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "learning_rate": 1.3737846476252889e-05, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "loss": 0.28252097964286804, + "step": 20790 + }, + { + "ce_loss": 0.03667629882693291, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "distill_loss": 0.16667498648166656, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "ref_ce_loss": 0.05718651041388512, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "loss": 0.2800046503543854, + "step": 20790 + }, + { + "ce_loss": 0.029736505821347237, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "distill_loss": 0.11351748555898666, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "ref_ce_loss": 0.039131175726652145, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "loss": 0.5445693731307983, + "step": 20790 + }, + { + "ce_loss": 0.04326792433857918, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "distill_loss": 0.37276849150657654, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "ref_ce_loss": 0.043659619987010956, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "loss": 0.3070202171802521, + "step": 20790 + }, + { + "ce_loss": 0.022522946819663048, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "distill_loss": 0.18984946608543396, + "epoch": 6.934623082054703, + "step": 20790 + }, + { + "epoch": 6.934623082054703, + "ref_ce_loss": 0.07139455527067184, + "step": 20790 + }, + { + "epoch": 6.937958639092728, + "loss": 0.4202, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "grad_norm": 4.608834743499756, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "learning_rate": 1.3653285690028349e-05, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "loss": 0.6431030631065369, + "step": 20800 + }, + { + "ce_loss": 0.09889832884073257, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "distill_loss": 0.41358011960983276, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "ref_ce_loss": 0.09461617469787598, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "loss": 0.3524114787578583, + "step": 20800 + }, + { + "ce_loss": 0.036635737866163254, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "distill_loss": 0.20360025763511658, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "ref_ce_loss": 0.0815434455871582, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "loss": 0.2596745193004608, + "step": 20800 + }, + { + "ce_loss": 0.04510733485221863, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "distill_loss": 0.17303520441055298, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "ref_ce_loss": 0.04140667989850044, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "loss": 0.399715393781662, + "step": 20800 + }, + { + "ce_loss": 0.035806529223918915, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "distill_loss": 0.32043132185935974, + "epoch": 6.937958639092728, + "step": 20800 + }, + { + "epoch": 6.937958639092728, + "ref_ce_loss": 0.04334215074777603, + "step": 20800 + }, + { + "epoch": 6.9412941961307535, + "loss": 0.4666, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "grad_norm": 6.359853267669678, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "learning_rate": 1.3568973546426332e-05, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "loss": 0.9467858076095581, + "step": 20810 + }, + { + "ce_loss": 0.06935672461986542, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "distill_loss": 0.4101283848285675, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "ref_ce_loss": 0.06270965188741684, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "loss": 0.2544810473918915, + "step": 20810 + }, + { + "ce_loss": 0.015241993591189384, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "distill_loss": 0.14630302786827087, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "ref_ce_loss": 0.046901024878025055, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "loss": 0.3560320734977722, + "step": 20810 + }, + { + "ce_loss": 0.00998441968113184, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "distill_loss": 0.2614750266075134, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "ref_ce_loss": 0.028601357713341713, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "loss": 0.5675954818725586, + "step": 20810 + }, + { + "ce_loss": 0.027431802824139595, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "distill_loss": 0.36020371317863464, + "epoch": 6.9412941961307535, + "step": 20810 + }, + { + "epoch": 6.9412941961307535, + "ref_ce_loss": 0.06007641553878784, + "step": 20810 + }, + { + "epoch": 6.944629753168779, + "loss": 0.449, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "grad_norm": 4.561250686645508, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "learning_rate": 1.34849101991989e-05, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "loss": 0.4040166139602661, + "step": 20820 + }, + { + "ce_loss": 0.025941571220755577, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "distill_loss": 0.20139163732528687, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "ref_ce_loss": 0.0640379786491394, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "loss": 0.30845510959625244, + "step": 20820 + }, + { + "ce_loss": 0.04371127858757973, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "distill_loss": 0.204574316740036, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "ref_ce_loss": 0.04178668558597565, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "loss": 0.39555543661117554, + "step": 20820 + }, + { + "ce_loss": 0.03262713551521301, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "distill_loss": 0.2215277999639511, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "ref_ce_loss": 0.06563069671392441, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "loss": 0.45052284002304077, + "step": 20820 + }, + { + "ce_loss": 0.11583734303712845, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "distill_loss": 0.2701414227485657, + "epoch": 6.944629753168779, + "step": 20820 + }, + { + "epoch": 6.944629753168779, + "ref_ce_loss": 0.06444041430950165, + "step": 20820 + }, + { + "epoch": 6.947965310206804, + "loss": 0.4061, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "grad_norm": 4.293476581573486, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "learning_rate": 1.3401095801644462e-05, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "loss": 0.2711658775806427, + "step": 20830 + }, + { + "ce_loss": 0.06868616491556168, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "distill_loss": 0.13750752806663513, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "ref_ce_loss": 0.06487326323986053, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "loss": 0.34549322724342346, + "step": 20830 + }, + { + "ce_loss": 0.016219772398471832, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "distill_loss": 0.2700633108615875, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "ref_ce_loss": 0.059070926159620285, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "loss": 0.5564180612564087, + "step": 20830 + }, + { + "ce_loss": 0.024273596704006195, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "distill_loss": 0.3557685315608978, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "ref_ce_loss": 0.07073020935058594, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "loss": 0.3989897072315216, + "step": 20830 + }, + { + "ce_loss": 0.06166957691311836, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "distill_loss": 0.1681160181760788, + "epoch": 6.947965310206804, + "step": 20830 + }, + { + "epoch": 6.947965310206804, + "ref_ce_loss": 0.04308732599020004, + "step": 20830 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.4508, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "grad_norm": 4.196435451507568, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "learning_rate": 1.3317530506607405e-05, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.30096614360809326, + "step": 20840 + }, + { + "ce_loss": 0.03617022559046745, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "distill_loss": 0.2046215534210205, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "ref_ce_loss": 0.0513840913772583, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.4168870747089386, + "step": 20840 + }, + { + "ce_loss": 0.02515067718923092, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "distill_loss": 0.3294127583503723, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "ref_ce_loss": 0.05164066329598427, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.3690844774246216, + "step": 20840 + }, + { + "ce_loss": 0.012738215737044811, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "distill_loss": 0.25531935691833496, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "ref_ce_loss": 0.06127219274640083, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "loss": 0.563440203666687, + "step": 20840 + }, + { + "ce_loss": 0.023211704567074776, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "distill_loss": 0.3891507387161255, + "epoch": 6.9513008672448295, + "step": 20840 + }, + { + "epoch": 6.9513008672448295, + "ref_ce_loss": 0.06096247583627701, + "step": 20840 + }, + { + "epoch": 6.954636424282855, + "loss": 0.4044, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "grad_norm": 4.023589134216309, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "learning_rate": 1.3234214466477877e-05, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "loss": 0.28012433648109436, + "step": 20850 + }, + { + "ce_loss": 0.03320290520787239, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "distill_loss": 0.19837743043899536, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "ref_ce_loss": 0.037537477910518646, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "loss": 0.37420663237571716, + "step": 20850 + }, + { + "ce_loss": 0.05524859204888344, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "distill_loss": 0.2331634759902954, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "ref_ce_loss": 0.04749821871519089, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "loss": 0.46007564663887024, + "step": 20850 + }, + { + "ce_loss": 0.060388337820768356, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "distill_loss": 0.3201514184474945, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "ref_ce_loss": 0.05457288771867752, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "loss": 0.2581188380718231, + "step": 20850 + }, + { + "ce_loss": 0.03130757808685303, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "distill_loss": 0.162839874625206, + "epoch": 6.954636424282855, + "step": 20850 + }, + { + "epoch": 6.954636424282855, + "ref_ce_loss": 0.05107346549630165, + "step": 20850 + }, + { + "epoch": 6.95797198132088, + "loss": 0.399, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "grad_norm": 5.500354766845703, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "learning_rate": 1.315114783319146e-05, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "loss": 0.8441788554191589, + "step": 20860 + }, + { + "ce_loss": 0.12463463097810745, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "distill_loss": 0.37730810046195984, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "ref_ce_loss": 0.08145100623369217, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "loss": 0.7428448796272278, + "step": 20860 + }, + { + "ce_loss": 0.0943167582154274, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "distill_loss": 0.20089185237884521, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "ref_ce_loss": 0.11132627725601196, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "loss": 0.43067431449890137, + "step": 20860 + }, + { + "ce_loss": 0.04894329980015755, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "distill_loss": 0.3266683518886566, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "ref_ce_loss": 0.05494893714785576, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "loss": 0.4565647840499878, + "step": 20860 + }, + { + "ce_loss": 0.06725724786520004, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "distill_loss": 0.2948342561721802, + "epoch": 6.95797198132088, + "step": 20860 + }, + { + "epoch": 6.95797198132088, + "ref_ce_loss": 0.06600886583328247, + "step": 20860 + }, + { + "epoch": 6.961307538358906, + "loss": 0.4097, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "grad_norm": 5.8303399085998535, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "learning_rate": 1.3068330758228951e-05, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "loss": 0.498879075050354, + "step": 20870 + }, + { + "ce_loss": 0.044609811156988144, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "distill_loss": 0.21364165842533112, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "ref_ce_loss": 0.07443445175886154, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "loss": 0.6392194628715515, + "step": 20870 + }, + { + "ce_loss": 0.0752585232257843, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "distill_loss": 0.21459460258483887, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "ref_ce_loss": 0.049674663692712784, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "loss": 0.3434360921382904, + "step": 20870 + }, + { + "ce_loss": 0.036089323461055756, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "distill_loss": 0.17388133704662323, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "ref_ce_loss": 0.05977735295891762, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "loss": 0.6956120133399963, + "step": 20870 + }, + { + "ce_loss": 0.08760103583335876, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "distill_loss": 0.4955027997493744, + "epoch": 6.961307538358906, + "step": 20870 + }, + { + "epoch": 6.961307538358906, + "ref_ce_loss": 0.0775177851319313, + "step": 20870 + }, + { + "epoch": 6.964643095396931, + "loss": 0.3955, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "grad_norm": 4.173266887664795, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "learning_rate": 1.2985763392615972e-05, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "loss": 0.21807421743869781, + "step": 20880 + }, + { + "ce_loss": 0.03225923329591751, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "distill_loss": 0.143633171916008, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "ref_ce_loss": 0.042038701474666595, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "loss": 0.7549881339073181, + "step": 20880 + }, + { + "ce_loss": 0.04640970006585121, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "distill_loss": 0.24397540092468262, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "ref_ce_loss": 0.04105795919895172, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "loss": 0.28283512592315674, + "step": 20880 + }, + { + "ce_loss": 0.057034965604543686, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "distill_loss": 0.1521199345588684, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "ref_ce_loss": 0.048486873507499695, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "loss": 0.46666327118873596, + "step": 20880 + }, + { + "ce_loss": 0.085448257625103, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "distill_loss": 0.228860542178154, + "epoch": 6.964643095396931, + "step": 20880 + }, + { + "epoch": 6.964643095396931, + "ref_ce_loss": 0.06538163125514984, + "step": 20880 + }, + { + "epoch": 6.967978652434956, + "loss": 0.3826, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "grad_norm": 4.615096092224121, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "learning_rate": 1.2903445886922863e-05, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "loss": 0.47802993655204773, + "step": 20890 + }, + { + "ce_loss": 0.07998108118772507, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "distill_loss": 0.18245376646518707, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "ref_ce_loss": 0.09586384892463684, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "loss": 0.2591511607170105, + "step": 20890 + }, + { + "ce_loss": 0.02041383646428585, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "distill_loss": 0.18587136268615723, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "ref_ce_loss": 0.05260073021054268, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "loss": 0.27677682042121887, + "step": 20890 + }, + { + "ce_loss": 0.02615499682724476, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "distill_loss": 0.14043205976486206, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "ref_ce_loss": 0.046882420778274536, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "loss": 0.3209933936595917, + "step": 20890 + }, + { + "ce_loss": 0.03233850374817848, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "distill_loss": 0.14990727603435516, + "epoch": 6.967978652434956, + "step": 20890 + }, + { + "epoch": 6.967978652434956, + "ref_ce_loss": 0.052740324288606644, + "step": 20890 + }, + { + "epoch": 6.971314209472982, + "loss": 0.4014, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "grad_norm": 5.587108135223389, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "learning_rate": 1.2821378391264282e-05, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "loss": 0.3331134617328644, + "step": 20900 + }, + { + "ce_loss": 0.013039899058640003, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "distill_loss": 0.23820891976356506, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "ref_ce_loss": 0.05505063757300377, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "loss": 0.31397545337677, + "step": 20900 + }, + { + "ce_loss": 0.026968948543071747, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "distill_loss": 0.10655680298805237, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "ref_ce_loss": 0.050659481436014175, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "loss": 0.3516398072242737, + "step": 20900 + }, + { + "ce_loss": 0.018841532990336418, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "distill_loss": 0.12434244155883789, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "ref_ce_loss": 0.04103760048747063, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "loss": 0.3670431971549988, + "step": 20900 + }, + { + "ce_loss": 0.01649991050362587, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "distill_loss": 0.23683831095695496, + "epoch": 6.971314209472982, + "step": 20900 + }, + { + "epoch": 6.971314209472982, + "ref_ce_loss": 0.07243819534778595, + "step": 20900 + }, + { + "epoch": 6.974649766511007, + "loss": 0.4017, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "grad_norm": 3.5513551235198975, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "learning_rate": 1.2739561055298975e-05, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "loss": 0.4680349826812744, + "step": 20910 + }, + { + "ce_loss": 0.06049255281686783, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "distill_loss": 0.2663050591945648, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "ref_ce_loss": 0.0641944482922554, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "loss": 0.6693120002746582, + "step": 20910 + }, + { + "ce_loss": 0.041746288537979126, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "distill_loss": 0.524785578250885, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "ref_ce_loss": 0.0707206279039383, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "loss": 0.5915324687957764, + "step": 20910 + }, + { + "ce_loss": 0.045784156769514084, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "distill_loss": 0.3815018832683563, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "ref_ce_loss": 0.08063600212335587, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "loss": 0.3671169877052307, + "step": 20910 + }, + { + "ce_loss": 0.026919331401586533, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "distill_loss": 0.30458635091781616, + "epoch": 6.974649766511007, + "step": 20910 + }, + { + "epoch": 6.974649766511007, + "ref_ce_loss": 0.035399653017520905, + "step": 20910 + }, + { + "epoch": 6.977985323549032, + "loss": 0.3908, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "grad_norm": 3.7871408462524414, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "learning_rate": 1.2657994028229496e-05, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "loss": 0.522492527961731, + "step": 20920 + }, + { + "ce_loss": 0.03481413051486015, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "distill_loss": 0.15558190643787384, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "ref_ce_loss": 0.04664992541074753, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "loss": 0.25579845905303955, + "step": 20920 + }, + { + "ce_loss": 0.03506955876946449, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "distill_loss": 0.13314028084278107, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "ref_ce_loss": 0.03844781219959259, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "loss": 0.3240671455860138, + "step": 20920 + }, + { + "ce_loss": 0.03473073989152908, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "distill_loss": 0.19969017803668976, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "ref_ce_loss": 0.06953977793455124, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "loss": 0.3197997808456421, + "step": 20920 + }, + { + "ce_loss": 0.01574970968067646, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "distill_loss": 0.2247905135154724, + "epoch": 6.977985323549032, + "step": 20920 + }, + { + "epoch": 6.977985323549032, + "ref_ce_loss": 0.059356823563575745, + "step": 20920 + }, + { + "epoch": 6.981320880587058, + "loss": 0.3878, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "grad_norm": 5.743426322937012, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "learning_rate": 1.2576677458801875e-05, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "loss": 0.36012518405914307, + "step": 20930 + }, + { + "ce_loss": 0.051608890295028687, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "distill_loss": 0.2507593631744385, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "ref_ce_loss": 0.057602979242801666, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "loss": 0.36070674657821655, + "step": 20930 + }, + { + "ce_loss": 0.010836776345968246, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "distill_loss": 0.22331926226615906, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "ref_ce_loss": 0.04523332417011261, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "loss": 0.3094327449798584, + "step": 20930 + }, + { + "ce_loss": 0.0389716662466526, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "distill_loss": 0.17145349085330963, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "ref_ce_loss": 0.05656014755368233, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "loss": 0.7913627028465271, + "step": 20930 + }, + { + "ce_loss": 0.0768100693821907, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "distill_loss": 0.25836631655693054, + "epoch": 6.981320880587058, + "step": 20930 + }, + { + "epoch": 6.981320880587058, + "ref_ce_loss": 0.10429731756448746, + "step": 20930 + }, + { + "epoch": 6.984656437625083, + "loss": 0.3749, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "grad_norm": 3.8919103145599365, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "learning_rate": 1.249561149530553e-05, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "loss": 0.4512990415096283, + "step": 20940 + }, + { + "ce_loss": 0.010347792878746986, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "distill_loss": 0.21304087340831757, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "ref_ce_loss": 0.07783834636211395, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "loss": 0.2394438087940216, + "step": 20940 + }, + { + "ce_loss": 0.020768312737345695, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "distill_loss": 0.17492659389972687, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "ref_ce_loss": 0.043548326939344406, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "loss": 0.45974916219711304, + "step": 20940 + }, + { + "ce_loss": 0.05422638729214668, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "distill_loss": 0.24236030876636505, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "ref_ce_loss": 0.04444364830851555, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "loss": 0.5540006756782532, + "step": 20940 + }, + { + "ce_loss": 0.09376584738492966, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "distill_loss": 0.3020194470882416, + "epoch": 6.984656437625083, + "step": 20940 + }, + { + "epoch": 6.984656437625083, + "ref_ce_loss": 0.055338360369205475, + "step": 20940 + }, + { + "epoch": 6.987991994663108, + "loss": 0.3807, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "grad_norm": 3.6167783737182617, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "learning_rate": 1.2414796285572704e-05, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "loss": 0.32541486620903015, + "step": 20950 + }, + { + "ce_loss": 0.0197074506431818, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "distill_loss": 0.17429247498512268, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "ref_ce_loss": 0.05691966041922569, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "loss": 0.4718301594257355, + "step": 20950 + }, + { + "ce_loss": 0.050296757370233536, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "distill_loss": 0.3400103449821472, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "ref_ce_loss": 0.08132241666316986, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "loss": 0.4324634075164795, + "step": 20950 + }, + { + "ce_loss": 0.008074936456978321, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "distill_loss": 0.24002067744731903, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "ref_ce_loss": 0.05884374678134918, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "loss": 0.42272958159446716, + "step": 20950 + }, + { + "ce_loss": 0.03418363630771637, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "distill_loss": 0.13855905830860138, + "epoch": 6.987991994663108, + "step": 20950 + }, + { + "epoch": 6.987991994663108, + "ref_ce_loss": 0.04546462744474411, + "step": 20950 + }, + { + "epoch": 6.991327551701134, + "loss": 0.3778, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "grad_norm": 4.432559967041016, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "learning_rate": 1.2334231976978543e-05, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "loss": 0.45632582902908325, + "step": 20960 + }, + { + "ce_loss": 0.06276565790176392, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "distill_loss": 0.1557421088218689, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "ref_ce_loss": 0.08415982872247696, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "loss": 0.3706178665161133, + "step": 20960 + }, + { + "ce_loss": 0.046508148312568665, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "distill_loss": 0.1922816038131714, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "ref_ce_loss": 0.059828322380781174, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "loss": 0.2592690587043762, + "step": 20960 + }, + { + "ce_loss": 0.043664492666721344, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "distill_loss": 0.16917598247528076, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "ref_ce_loss": 0.044888705015182495, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "loss": 0.3581051826477051, + "step": 20960 + }, + { + "ce_loss": 0.02837333455681801, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "distill_loss": 0.11634084582328796, + "epoch": 6.991327551701134, + "step": 20960 + }, + { + "epoch": 6.991327551701134, + "ref_ce_loss": 0.04302706569433212, + "step": 20960 + }, + { + "epoch": 6.994663108739159, + "loss": 0.404, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "grad_norm": 3.7893307209014893, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "learning_rate": 1.2253918716440574e-05, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "loss": 0.3417932689189911, + "step": 20970 + }, + { + "ce_loss": 0.00941214244812727, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "distill_loss": 0.16308821737766266, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "ref_ce_loss": 0.05768497660756111, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "loss": 0.33781731128692627, + "step": 20970 + }, + { + "ce_loss": 0.025083765387535095, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "distill_loss": 0.2208465039730072, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "ref_ce_loss": 0.04458913579583168, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "loss": 0.30446338653564453, + "step": 20970 + }, + { + "ce_loss": 0.04164633899927139, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "distill_loss": 0.18755929172039032, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "ref_ce_loss": 0.04275144264101982, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "loss": 0.49016839265823364, + "step": 20970 + }, + { + "ce_loss": 0.03269338980317116, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "distill_loss": 0.33109739422798157, + "epoch": 6.994663108739159, + "step": 20970 + }, + { + "epoch": 6.994663108739159, + "ref_ce_loss": 0.07132968306541443, + "step": 20970 + }, + { + "epoch": 6.997998665777184, + "loss": 0.3758, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "grad_norm": 3.27852725982666, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "learning_rate": 1.2173856650418445e-05, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "loss": 0.5222610235214233, + "step": 20980 + }, + { + "ce_loss": 0.050382375717163086, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "distill_loss": 0.32159072160720825, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "ref_ce_loss": 0.11399988830089569, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "loss": 0.22158987820148468, + "step": 20980 + }, + { + "ce_loss": 0.0016337481793016195, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "distill_loss": 0.17573025822639465, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "ref_ce_loss": 0.0440516471862793, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "loss": 0.3978367745876312, + "step": 20980 + }, + { + "ce_loss": 0.08592233061790466, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "distill_loss": 0.2198159098625183, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "ref_ce_loss": 0.0556352324783802, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "loss": 0.3263167440891266, + "step": 20980 + }, + { + "ce_loss": 0.04236403852701187, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "distill_loss": 0.1966823935508728, + "epoch": 6.997998665777184, + "step": 20980 + }, + { + "epoch": 6.997998665777184, + "ref_ce_loss": 0.04797091335058212, + "step": 20980 + }, + { + "epoch": 7.00133422281521, + "loss": 0.3182, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "grad_norm": 4.379688262939453, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "learning_rate": 1.2094045924913798e-05, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "loss": 0.28644007444381714, + "step": 20990 + }, + { + "ce_loss": 0.01825113408267498, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "distill_loss": 0.19746197760105133, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "ref_ce_loss": 0.07044369727373123, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "loss": 0.21840335428714752, + "step": 20990 + }, + { + "ce_loss": 0.020626522600650787, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "distill_loss": 0.14074021577835083, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "ref_ce_loss": 0.035429518669843674, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "loss": 0.8605027198791504, + "step": 20990 + }, + { + "ce_loss": 0.058410659432411194, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "distill_loss": 0.3554646968841553, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "ref_ce_loss": 0.07566055655479431, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "loss": 0.2325642704963684, + "step": 20990 + }, + { + "ce_loss": 0.04752802848815918, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "distill_loss": 0.13366730511188507, + "epoch": 7.00133422281521, + "step": 20990 + }, + { + "epoch": 7.00133422281521, + "ref_ce_loss": 0.05121288821101189, + "step": 20990 + }, + { + "epoch": 7.004669779853235, + "loss": 0.3769, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "grad_norm": 4.163489818572998, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "learning_rate": 1.2014486685469959e-05, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "loss": 0.25092020630836487, + "step": 21000 + }, + { + "ce_loss": 0.03144634887576103, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "distill_loss": 0.16729390621185303, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "ref_ce_loss": 0.039170920848846436, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "loss": 0.18306779861450195, + "step": 21000 + }, + { + "ce_loss": 0.004771217238157988, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "distill_loss": 0.12938238680362701, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "ref_ce_loss": 0.04884642735123634, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "loss": 0.25768306851387024, + "step": 21000 + }, + { + "ce_loss": 0.011849144473671913, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "distill_loss": 0.14590494334697723, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "ref_ce_loss": 0.05942894518375397, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "loss": 0.18659117817878723, + "step": 21000 + }, + { + "ce_loss": 0.004362636711448431, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "distill_loss": 0.15117177367210388, + "epoch": 7.004669779853235, + "step": 21000 + }, + { + "epoch": 7.004669779853235, + "ref_ce_loss": 0.030531061813235283, + "step": 21000 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.3495, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "grad_norm": 2.965975761413574, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "learning_rate": 1.1935179077171525e-05, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.36496981978416443, + "step": 21010 + }, + { + "ce_loss": 0.027796225622296333, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "distill_loss": 0.28838932514190674, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "ref_ce_loss": 0.034264348447322845, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.8337552547454834, + "step": 21010 + }, + { + "ce_loss": 0.011524084955453873, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "distill_loss": 0.22686733305454254, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "ref_ce_loss": 0.05295266583561897, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.2895921468734741, + "step": 21010 + }, + { + "ce_loss": 0.038470905274152756, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "distill_loss": 0.16852986812591553, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "ref_ce_loss": 0.060499511659145355, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "loss": 0.3831683099269867, + "step": 21010 + }, + { + "ce_loss": 0.007674395106732845, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "distill_loss": 0.31148290634155273, + "epoch": 7.0080053368912605, + "step": 21010 + }, + { + "epoch": 7.0080053368912605, + "ref_ce_loss": 0.05108707398176193, + "step": 21010 + }, + { + "epoch": 7.011340893929286, + "loss": 0.358, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "grad_norm": 4.229156494140625, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "learning_rate": 1.1856123244644355e-05, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "loss": 0.3345952332019806, + "step": 21020 + }, + { + "ce_loss": 0.02739858441054821, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "distill_loss": 0.15175358951091766, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "ref_ce_loss": 0.04447855055332184, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "loss": 0.46418091654777527, + "step": 21020 + }, + { + "ce_loss": 0.08415623754262924, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "distill_loss": 0.26705658435821533, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "ref_ce_loss": 0.06111234799027443, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "loss": 0.3672318756580353, + "step": 21020 + }, + { + "ce_loss": 0.06163648143410683, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "distill_loss": 0.17714300751686096, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "ref_ce_loss": 0.04045576602220535, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "loss": 0.2918590009212494, + "step": 21020 + }, + { + "ce_loss": 0.020832950249314308, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "distill_loss": 0.21937981247901917, + "epoch": 7.011340893929286, + "step": 21020 + }, + { + "epoch": 7.011340893929286, + "ref_ce_loss": 0.0513937808573246, + "step": 21020 + }, + { + "epoch": 7.014676450967311, + "loss": 0.3688, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "grad_norm": 3.1062397956848145, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "learning_rate": 1.1777319332055062e-05, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "loss": 0.39231839776039124, + "step": 21030 + }, + { + "ce_loss": 0.03197869285941124, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "distill_loss": 0.25150421261787415, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "ref_ce_loss": 0.05561299994587898, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "loss": 0.3744685649871826, + "step": 21030 + }, + { + "ce_loss": 0.025469930842518806, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "distill_loss": 0.1823444664478302, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "ref_ce_loss": 0.06127699092030525, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "loss": 0.1797933280467987, + "step": 21030 + }, + { + "ce_loss": 0.010841690935194492, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "distill_loss": 0.11612024158239365, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "ref_ce_loss": 0.02793746255338192, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "loss": 0.5318374633789062, + "step": 21030 + }, + { + "ce_loss": 0.026757286861538887, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "distill_loss": 0.3860778212547302, + "epoch": 7.014676450967311, + "step": 21030 + }, + { + "epoch": 7.014676450967311, + "ref_ce_loss": 0.0411611869931221, + "step": 21030 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.3613, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "grad_norm": 2.4196865558624268, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "learning_rate": 1.169876748311091e-05, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.31978467106819153, + "step": 21040 + }, + { + "ce_loss": 0.01599363051354885, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "distill_loss": 0.18945179879665375, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "ref_ce_loss": 0.0704239159822464, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.24735331535339355, + "step": 21040 + }, + { + "ce_loss": 0.024469342082738876, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "distill_loss": 0.1651473045349121, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "ref_ce_loss": 0.04400210455060005, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.5415984988212585, + "step": 21040 + }, + { + "ce_loss": 0.037225011736154556, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "distill_loss": 0.1968650221824646, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "ref_ce_loss": 0.07533683627843857, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "loss": 0.652391254901886, + "step": 21040 + }, + { + "ce_loss": 0.013494131155312061, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "distill_loss": 0.1403832733631134, + "epoch": 7.0180120080053365, + "step": 21040 + }, + { + "epoch": 7.0180120080053365, + "ref_ce_loss": 0.05307801812887192, + "step": 21040 + }, + { + "epoch": 7.021347565043362, + "loss": 0.3395, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "grad_norm": 3.360076427459717, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "learning_rate": 1.1620467841059511e-05, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "loss": 0.2819208800792694, + "step": 21050 + }, + { + "ce_loss": 0.0377165786921978, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "distill_loss": 0.1763533353805542, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "ref_ce_loss": 0.04387078434228897, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "loss": 0.42356985807418823, + "step": 21050 + }, + { + "ce_loss": 0.02946825698018074, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "distill_loss": 0.149619922041893, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "ref_ce_loss": 0.04405827820301056, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "loss": 0.33829575777053833, + "step": 21050 + }, + { + "ce_loss": 0.03332025557756424, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "distill_loss": 0.1677481085062027, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "ref_ce_loss": 0.04332270473241806, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "loss": 0.2877388894557953, + "step": 21050 + }, + { + "ce_loss": 0.011127806268632412, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "distill_loss": 0.236330047249794, + "epoch": 7.021347565043362, + "step": 21050 + }, + { + "epoch": 7.021347565043362, + "ref_ce_loss": 0.040191177278757095, + "step": 21050 + }, + { + "epoch": 7.024683122081387, + "loss": 0.3552, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "grad_norm": 3.0807788372039795, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "learning_rate": 1.1542420548688464e-05, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "loss": 0.2960524260997772, + "step": 21060 + }, + { + "ce_loss": 0.010671052150428295, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "distill_loss": 0.1927395910024643, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "ref_ce_loss": 0.06929319351911545, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "loss": 0.3725976347923279, + "step": 21060 + }, + { + "ce_loss": 0.010630992241203785, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "distill_loss": 0.2733990252017975, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "ref_ce_loss": 0.03346521779894829, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "loss": 0.3648867607116699, + "step": 21060 + }, + { + "ce_loss": 0.014998015947639942, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "distill_loss": 0.30057311058044434, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "ref_ce_loss": 0.03862837329506874, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "loss": 0.6767865419387817, + "step": 21060 + }, + { + "ce_loss": 0.005089592654258013, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "distill_loss": 0.3199460804462433, + "epoch": 7.024683122081387, + "step": 21060 + }, + { + "epoch": 7.024683122081387, + "ref_ce_loss": 0.07371282577514648, + "step": 21060 + }, + { + "epoch": 7.028018679119413, + "loss": 0.3651, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "grad_norm": 4.383955001831055, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "learning_rate": 1.1464625748325284e-05, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "loss": 0.217289537191391, + "step": 21070 + }, + { + "ce_loss": 0.02977265790104866, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "distill_loss": 0.13479569554328918, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "ref_ce_loss": 0.052569203078746796, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "loss": 0.25972500443458557, + "step": 21070 + }, + { + "ce_loss": 0.012066180817782879, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "distill_loss": 0.16654083132743835, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "ref_ce_loss": 0.026700763031840324, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "loss": 0.31060537695884705, + "step": 21070 + }, + { + "ce_loss": 0.018852874636650085, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "distill_loss": 0.1913139373064041, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "ref_ce_loss": 0.05901574715971947, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "loss": 0.35601550340652466, + "step": 21070 + }, + { + "ce_loss": 0.05516732484102249, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "distill_loss": 0.2630683481693268, + "epoch": 7.028018679119413, + "step": 21070 + }, + { + "epoch": 7.028018679119413, + "ref_ce_loss": 0.037460535764694214, + "step": 21070 + }, + { + "epoch": 7.031354236157438, + "loss": 0.3844, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "grad_norm": 3.6851420402526855, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "learning_rate": 1.1387083581836992e-05, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "loss": 0.3004406690597534, + "step": 21080 + }, + { + "ce_loss": 0.005100657232105732, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "distill_loss": 0.2407703995704651, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "ref_ce_loss": 0.03812128305435181, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "loss": 0.2978489398956299, + "step": 21080 + }, + { + "ce_loss": 0.016226215288043022, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "distill_loss": 0.22501929104328156, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "ref_ce_loss": 0.056322839111089706, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "loss": 0.6905106902122498, + "step": 21080 + }, + { + "ce_loss": 0.024305883795022964, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "distill_loss": 0.30780029296875, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "ref_ce_loss": 0.07255929708480835, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "loss": 0.23199906945228577, + "step": 21080 + }, + { + "ce_loss": 0.04769589379429817, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "distill_loss": 0.1294175684452057, + "epoch": 7.031354236157438, + "step": 21080 + }, + { + "epoch": 7.031354236157438, + "ref_ce_loss": 0.040796466171741486, + "step": 21080 + }, + { + "epoch": 7.034689793195463, + "loss": 0.3744, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "grad_norm": 2.6544904708862305, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "learning_rate": 1.1309794190629906e-05, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "loss": 0.26282796263694763, + "step": 21090 + }, + { + "ce_loss": 0.015210578218102455, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "distill_loss": 0.20315514504909515, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "ref_ce_loss": 0.044133421033620834, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "loss": 0.48627135157585144, + "step": 21090 + }, + { + "ce_loss": 0.04514510929584503, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "distill_loss": 0.26357829570770264, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "ref_ce_loss": 0.0717320367693901, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "loss": 0.3367202877998352, + "step": 21090 + }, + { + "ce_loss": 0.05366295948624611, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "distill_loss": 0.20038925111293793, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "ref_ce_loss": 0.08253741264343262, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "loss": 0.39167726039886475, + "step": 21090 + }, + { + "ce_loss": 0.01120414212346077, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "distill_loss": 0.1971331387758255, + "epoch": 7.034689793195463, + "step": 21090 + }, + { + "epoch": 7.034689793195463, + "ref_ce_loss": 0.03756158798933029, + "step": 21090 + }, + { + "epoch": 7.038025350233489, + "loss": 0.3475, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "grad_norm": 2.5539391040802, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "learning_rate": 1.1232757715649432e-05, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "loss": 0.4853425920009613, + "step": 21100 + }, + { + "ce_loss": 0.04947700724005699, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "distill_loss": 0.3233245611190796, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "ref_ce_loss": 0.05600043758749962, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "loss": 0.363929808139801, + "step": 21100 + }, + { + "ce_loss": 0.10695886611938477, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "distill_loss": 0.1929464340209961, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "ref_ce_loss": 0.06395082175731659, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "loss": 0.22964845597743988, + "step": 21100 + }, + { + "ce_loss": 0.00565209798514843, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "distill_loss": 0.1298438310623169, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "ref_ce_loss": 0.027368027716875076, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "loss": 0.5313661098480225, + "step": 21100 + }, + { + "ce_loss": 0.03788033872842789, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "distill_loss": 0.13044065237045288, + "epoch": 7.038025350233489, + "step": 21100 + }, + { + "epoch": 7.038025350233489, + "ref_ce_loss": 0.03457934409379959, + "step": 21100 + }, + { + "epoch": 7.041360907271514, + "loss": 0.3571, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "grad_norm": 2.350095748901367, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "learning_rate": 1.1155974297379644e-05, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "loss": 0.3459613025188446, + "step": 21110 + }, + { + "ce_loss": 0.00866713747382164, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "distill_loss": 0.20660671591758728, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "ref_ce_loss": 0.044547878205776215, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "loss": 0.23378105461597443, + "step": 21110 + }, + { + "ce_loss": 0.01803533174097538, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "distill_loss": 0.14041326940059662, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "ref_ce_loss": 0.050297707319259644, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "loss": 0.2744632363319397, + "step": 21110 + }, + { + "ce_loss": 0.026918690651655197, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "distill_loss": 0.10023162513971329, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "ref_ce_loss": 0.038829028606414795, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "loss": 0.5327430367469788, + "step": 21110 + }, + { + "ce_loss": 0.092887282371521, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "distill_loss": 0.17936775088310242, + "epoch": 7.041360907271514, + "step": 21110 + }, + { + "epoch": 7.041360907271514, + "ref_ce_loss": 0.06346702575683594, + "step": 21110 + }, + { + "epoch": 7.044696464309539, + "loss": 0.3346, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "grad_norm": 3.586826801300049, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "learning_rate": 1.1079444075843252e-05, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "loss": 0.4072696268558502, + "step": 21120 + }, + { + "ce_loss": 0.07858144491910934, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "distill_loss": 0.2500748932361603, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "ref_ce_loss": 0.05024728551506996, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "loss": 0.38662242889404297, + "step": 21120 + }, + { + "ce_loss": 0.1051878109574318, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "distill_loss": 0.19055181741714478, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "ref_ce_loss": 0.06847601383924484, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "loss": 0.37392425537109375, + "step": 21120 + }, + { + "ce_loss": 0.04408292844891548, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "distill_loss": 0.23299375176429749, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "ref_ce_loss": 0.061505965888500214, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "loss": 0.6570822596549988, + "step": 21120 + }, + { + "ce_loss": 0.07716977596282959, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "distill_loss": 0.32018449902534485, + "epoch": 7.044696464309539, + "step": 21120 + }, + { + "epoch": 7.044696464309539, + "ref_ce_loss": 0.09262377768754959, + "step": 21120 + }, + { + "epoch": 7.048032021347565, + "loss": 0.3679, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "grad_norm": 3.6819005012512207, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "learning_rate": 1.1003167190601153e-05, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "loss": 0.5182002186775208, + "step": 21130 + }, + { + "ce_loss": 0.019167399033904076, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "distill_loss": 0.3869504928588867, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "ref_ce_loss": 0.07341236621141434, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "loss": 0.2740857005119324, + "step": 21130 + }, + { + "ce_loss": 0.017344314604997635, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "distill_loss": 0.19354546070098877, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "ref_ce_loss": 0.06311025470495224, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "loss": 0.21160180866718292, + "step": 21130 + }, + { + "ce_loss": 0.042595986276865005, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "distill_loss": 0.12442618608474731, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "ref_ce_loss": 0.0371774397790432, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "loss": 0.20767563581466675, + "step": 21130 + }, + { + "ce_loss": 0.016156502068042755, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "distill_loss": 0.14854544401168823, + "epoch": 7.048032021347565, + "step": 21130 + }, + { + "epoch": 7.048032021347565, + "ref_ce_loss": 0.04287472739815712, + "step": 21130 + }, + { + "epoch": 7.05136757838559, + "loss": 0.3631, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "grad_norm": 2.9166581630706787, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "learning_rate": 1.0927143780752345e-05, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "loss": 0.32964104413986206, + "step": 21140 + }, + { + "ce_loss": 0.05006510391831398, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "distill_loss": 0.21244673430919647, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "ref_ce_loss": 0.0668855607509613, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "loss": 0.22159138321876526, + "step": 21140 + }, + { + "ce_loss": 0.03067399002611637, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "distill_loss": 0.10590535402297974, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "ref_ce_loss": 0.05489220470190048, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "loss": 0.6037322282791138, + "step": 21140 + }, + { + "ce_loss": 0.011360462754964828, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "distill_loss": 0.43363645672798157, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "ref_ce_loss": 0.06793065369129181, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "loss": 0.2623229920864105, + "step": 21140 + }, + { + "ce_loss": 0.014065076597034931, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "distill_loss": 0.19880500435829163, + "epoch": 7.05136757838559, + "step": 21140 + }, + { + "epoch": 7.05136757838559, + "ref_ce_loss": 0.04935702309012413, + "step": 21140 + }, + { + "epoch": 7.054703135423615, + "loss": 0.3285, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "grad_norm": 2.9054787158966064, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "learning_rate": 1.0851373984933532e-05, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "loss": 0.2806137204170227, + "step": 21150 + }, + { + "ce_loss": 0.03232104331254959, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "distill_loss": 0.15062671899795532, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "ref_ce_loss": 0.03469966724514961, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "loss": 0.4322219789028168, + "step": 21150 + }, + { + "ce_loss": 0.06931765377521515, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "distill_loss": 0.26579010486602783, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "ref_ce_loss": 0.04841863363981247, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "loss": 0.2573651075363159, + "step": 21150 + }, + { + "ce_loss": 0.004316686186939478, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "distill_loss": 0.1615641862154007, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "ref_ce_loss": 0.04651380330324173, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "loss": 0.25774532556533813, + "step": 21150 + }, + { + "ce_loss": 0.004958820529282093, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "distill_loss": 0.16653427481651306, + "epoch": 7.054703135423615, + "step": 21150 + }, + { + "epoch": 7.054703135423615, + "ref_ce_loss": 0.046766992658376694, + "step": 21150 + }, + { + "epoch": 7.058038692461641, + "loss": 0.345, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "grad_norm": 2.805272340774536, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "learning_rate": 1.0775857941318899e-05, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "loss": 0.3288522958755493, + "step": 21160 + }, + { + "ce_loss": 0.0010662720305845141, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "distill_loss": 0.252332866191864, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "ref_ce_loss": 0.04569484665989876, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "loss": 0.3757239580154419, + "step": 21160 + }, + { + "ce_loss": 0.003635851666331291, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "distill_loss": 0.11291775107383728, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "ref_ce_loss": 0.06550167500972748, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "loss": 0.2398977279663086, + "step": 21160 + }, + { + "ce_loss": 0.0565723218023777, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "distill_loss": 0.11294666677713394, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "ref_ce_loss": 0.05807725712656975, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "loss": 0.2866376042366028, + "step": 21160 + }, + { + "ce_loss": 0.017606956884264946, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "distill_loss": 0.18055574595928192, + "epoch": 7.058038692461641, + "step": 21160 + }, + { + "epoch": 7.058038692461641, + "ref_ce_loss": 0.046308476477861404, + "step": 21160 + }, + { + "epoch": 7.061374249499666, + "loss": 0.3608, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "grad_norm": 2.8814284801483154, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "learning_rate": 1.0700595787619925e-05, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "loss": 0.2647363543510437, + "step": 21170 + }, + { + "ce_loss": 0.01880454272031784, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "distill_loss": 0.16595271229743958, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "ref_ce_loss": 0.06280763447284698, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "loss": 0.26586365699768066, + "step": 21170 + }, + { + "ce_loss": 0.0166045892983675, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "distill_loss": 0.18876267969608307, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "ref_ce_loss": 0.05875850096344948, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "loss": 0.3638674318790436, + "step": 21170 + }, + { + "ce_loss": 0.026428092271089554, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "distill_loss": 0.20158889889717102, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "ref_ce_loss": 0.03497897461056709, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "loss": 0.2749888002872467, + "step": 21170 + }, + { + "ce_loss": 0.0217903982847929, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "distill_loss": 0.19239500164985657, + "epoch": 7.061374249499666, + "step": 21170 + }, + { + "epoch": 7.061374249499666, + "ref_ce_loss": 0.04525717347860336, + "step": 21170 + }, + { + "epoch": 7.064709806537691, + "loss": 0.3577, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "grad_norm": 4.790896415710449, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "learning_rate": 1.0625587661085105e-05, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "loss": 0.21709632873535156, + "step": 21180 + }, + { + "ce_loss": 0.01828574389219284, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "distill_loss": 0.14977601170539856, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "ref_ce_loss": 0.04868828505277634, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "loss": 0.3032727837562561, + "step": 21180 + }, + { + "ce_loss": 0.023485232144594193, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "distill_loss": 0.19155225157737732, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "ref_ce_loss": 0.05645429342985153, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "loss": 0.39241763949394226, + "step": 21180 + }, + { + "ce_loss": 0.047795649617910385, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "distill_loss": 0.2212338000535965, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "ref_ce_loss": 0.06751947104930878, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "loss": 0.25130558013916016, + "step": 21180 + }, + { + "ce_loss": 0.003252769820392132, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "distill_loss": 0.1279112547636032, + "epoch": 7.064709806537691, + "step": 21180 + }, + { + "epoch": 7.064709806537691, + "ref_ce_loss": 0.027870751917362213, + "step": 21180 + }, + { + "epoch": 7.068045363575717, + "loss": 0.3106, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "grad_norm": 3.6147572994232178, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "learning_rate": 1.0550833698499688e-05, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "loss": 0.3733055591583252, + "step": 21190 + }, + { + "ce_loss": 0.06108830124139786, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "distill_loss": 0.21531425416469574, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "ref_ce_loss": 0.06599297374486923, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "loss": 0.37621110677719116, + "step": 21190 + }, + { + "ce_loss": 0.048682309687137604, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "distill_loss": 0.2646140456199646, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "ref_ce_loss": 0.05080604925751686, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "loss": 0.43180280923843384, + "step": 21190 + }, + { + "ce_loss": 0.02251204289495945, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "distill_loss": 0.1456415057182312, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "ref_ce_loss": 0.027025721967220306, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "loss": 0.40786445140838623, + "step": 21190 + }, + { + "ce_loss": 0.06046026200056076, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "distill_loss": 0.22351795434951782, + "epoch": 7.068045363575717, + "step": 21190 + }, + { + "epoch": 7.068045363575717, + "ref_ce_loss": 0.03806782141327858, + "step": 21190 + }, + { + "epoch": 7.071380920613742, + "loss": 0.3688, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "grad_norm": 2.596191883087158, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "learning_rate": 1.0476334036185413e-05, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "loss": 0.26539090275764465, + "step": 21200 + }, + { + "ce_loss": 0.06284356117248535, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "distill_loss": 0.14580541849136353, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "ref_ce_loss": 0.037337347865104675, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "loss": 0.3034488558769226, + "step": 21200 + }, + { + "ce_loss": 0.016640178859233856, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "distill_loss": 0.2183038890361786, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "ref_ce_loss": 0.068434938788414, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "loss": 0.2993912398815155, + "step": 21200 + }, + { + "ce_loss": 0.02914571389555931, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "distill_loss": 0.2163100391626358, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "ref_ce_loss": 0.03776365891098976, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "loss": 0.312997043132782, + "step": 21200 + }, + { + "ce_loss": 0.01290181651711464, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "distill_loss": 0.20035392045974731, + "epoch": 7.071380920613742, + "step": 21200 + }, + { + "epoch": 7.071380920613742, + "ref_ce_loss": 0.060398805886507034, + "step": 21200 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.3435, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "grad_norm": 2.9420666694641113, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "learning_rate": 1.0402088810000237e-05, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.1917342245578766, + "step": 21210 + }, + { + "ce_loss": 0.007838577963411808, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "distill_loss": 0.1391885131597519, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "ref_ce_loss": 0.026517007499933243, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.2319009006023407, + "step": 21210 + }, + { + "ce_loss": 0.013528701849281788, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "distill_loss": 0.16961170732975006, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "ref_ce_loss": 0.04853387549519539, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.4764244556427002, + "step": 21210 + }, + { + "ce_loss": 0.020892919972538948, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "distill_loss": 0.2465275526046753, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "ref_ce_loss": 0.05295209959149361, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "loss": 0.25967442989349365, + "step": 21210 + }, + { + "ce_loss": 0.0026893108151853085, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "distill_loss": 0.13870014250278473, + "epoch": 7.0747164776517675, + "step": 21210 + }, + { + "epoch": 7.0747164776517675, + "ref_ce_loss": 0.04126835614442825, + "step": 21210 + }, + { + "epoch": 7.078052034689793, + "loss": 0.3146, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "grad_norm": 3.195159435272217, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "learning_rate": 1.0328098155338189e-05, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "loss": 0.3102601170539856, + "step": 21220 + }, + { + "ce_loss": 0.02565469965338707, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "distill_loss": 0.16854459047317505, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "ref_ce_loss": 0.07197984308004379, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "loss": 0.3003503084182739, + "step": 21220 + }, + { + "ce_loss": 0.08204754441976547, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "distill_loss": 0.17015287280082703, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "ref_ce_loss": 0.047961872071027756, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "loss": 0.24385954439640045, + "step": 21220 + }, + { + "ce_loss": 0.03164820745587349, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "distill_loss": 0.1497935950756073, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "ref_ce_loss": 0.04439567029476166, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "loss": 0.29692065715789795, + "step": 21220 + }, + { + "ce_loss": 0.025501569733023643, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "distill_loss": 0.19459682703018188, + "epoch": 7.078052034689793, + "step": 21220 + }, + { + "epoch": 7.078052034689793, + "ref_ce_loss": 0.05867702141404152, + "step": 21220 + }, + { + "epoch": 7.081387591727818, + "loss": 0.3179, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "grad_norm": 3.1232728958129883, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "learning_rate": 1.0254362207129035e-05, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "loss": 0.2356388121843338, + "step": 21230 + }, + { + "ce_loss": 0.035304874181747437, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "distill_loss": 0.136562317609787, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "ref_ce_loss": 0.06360061466693878, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "loss": 0.3611792027950287, + "step": 21230 + }, + { + "ce_loss": 0.04316425696015358, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "distill_loss": 0.14863713085651398, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "ref_ce_loss": 0.031219588592648506, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "loss": 0.27983278036117554, + "step": 21230 + }, + { + "ce_loss": 0.026451464742422104, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "distill_loss": 0.20756125450134277, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "ref_ce_loss": 0.04571819305419922, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "loss": 0.3777647912502289, + "step": 21230 + }, + { + "ce_loss": 0.009428311139345169, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "distill_loss": 0.21531248092651367, + "epoch": 7.081387591727818, + "step": 21230 + }, + { + "epoch": 7.081387591727818, + "ref_ce_loss": 0.040740445256233215, + "step": 21230 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.3516, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "grad_norm": 4.481757164001465, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "learning_rate": 1.0180881099838067e-05, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.24451516568660736, + "step": 21240 + }, + { + "ce_loss": 0.016348013654351234, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "distill_loss": 0.194583460688591, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "ref_ce_loss": 0.0334811732172966, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.20874151587486267, + "step": 21240 + }, + { + "ce_loss": 0.03042174130678177, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "distill_loss": 0.13220363855361938, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "ref_ce_loss": 0.03144841641187668, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.4618191719055176, + "step": 21240 + }, + { + "ce_loss": 0.04640379920601845, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "distill_loss": 0.2875906229019165, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "ref_ce_loss": 0.08858513832092285, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "loss": 0.25335830450057983, + "step": 21240 + }, + { + "ce_loss": 0.014944043010473251, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "distill_loss": 0.16305804252624512, + "epoch": 7.0847231487658435, + "step": 21240 + }, + { + "epoch": 7.0847231487658435, + "ref_ce_loss": 0.04924383386969566, + "step": 21240 + }, + { + "epoch": 7.088058705803869, + "loss": 0.3344, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "grad_norm": 6.538269996643066, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "learning_rate": 1.0107654967465844e-05, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "loss": 0.35251274704933167, + "step": 21250 + }, + { + "ce_loss": 0.01282446552067995, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "distill_loss": 0.16358213126659393, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "ref_ce_loss": 0.04842758923768997, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "loss": 0.15011943876743317, + "step": 21250 + }, + { + "ce_loss": 0.00510772131383419, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "distill_loss": 0.1163831353187561, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "ref_ce_loss": 0.028515568003058434, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "loss": 0.2913273274898529, + "step": 21250 + }, + { + "ce_loss": 0.04327051341533661, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "distill_loss": 0.20849546790122986, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "ref_ce_loss": 0.03915373235940933, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "loss": 0.3624514937400818, + "step": 21250 + }, + { + "ce_loss": 0.054993703961372375, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "distill_loss": 0.22235330939292908, + "epoch": 7.088058705803869, + "step": 21250 + }, + { + "epoch": 7.088058705803869, + "ref_ce_loss": 0.059161581099033356, + "step": 21250 + }, + { + "epoch": 7.091394262841894, + "loss": 0.3363, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "grad_norm": 2.7685844898223877, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "learning_rate": 1.0034683943547916e-05, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "loss": 0.2722393870353699, + "step": 21260 + }, + { + "ce_loss": 0.01598196104168892, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "distill_loss": 0.19478975236415863, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "ref_ce_loss": 0.044273532927036285, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "loss": 0.5252133011817932, + "step": 21260 + }, + { + "ce_loss": 0.013084584847092628, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "distill_loss": 0.2927495837211609, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "ref_ce_loss": 0.0654245987534523, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "loss": 0.3494012951850891, + "step": 21260 + }, + { + "ce_loss": 0.030542615801095963, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "distill_loss": 0.21779635548591614, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "ref_ce_loss": 0.04543349891901016, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "loss": 0.2055548131465912, + "step": 21260 + }, + { + "ce_loss": 0.004060425795614719, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "distill_loss": 0.16809694468975067, + "epoch": 7.091394262841894, + "step": 21260 + }, + { + "epoch": 7.091394262841894, + "ref_ce_loss": 0.021780950948596, + "step": 21260 + }, + { + "epoch": 7.09472981987992, + "loss": 0.3662, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "grad_norm": 5.921723365783691, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "learning_rate": 9.961968161154653e-06, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "loss": 0.6266645193099976, + "step": 21270 + }, + { + "ce_loss": 0.06745955348014832, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "distill_loss": 0.3753885328769684, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "ref_ce_loss": 0.07958387583494186, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "loss": 0.21966169774532318, + "step": 21270 + }, + { + "ce_loss": 0.04526279494166374, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "distill_loss": 0.15075144171714783, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "ref_ce_loss": 0.023584889248013496, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "loss": 0.2990386188030243, + "step": 21270 + }, + { + "ce_loss": 0.020705245435237885, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "distill_loss": 0.20941773056983948, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "ref_ce_loss": 0.05655130743980408, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "loss": 0.2656441628932953, + "step": 21270 + }, + { + "ce_loss": 0.01828238181769848, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "distill_loss": 0.19571369886398315, + "epoch": 7.09472981987992, + "step": 21270 + }, + { + "epoch": 7.09472981987992, + "ref_ce_loss": 0.05159137025475502, + "step": 21270 + }, + { + "epoch": 7.098065376917945, + "loss": 0.3966, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "grad_norm": 3.6227405071258545, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "learning_rate": 9.889507752891019e-06, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "loss": 0.3899279236793518, + "step": 21280 + }, + { + "ce_loss": 0.04282451048493385, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "distill_loss": 0.2592725455760956, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "ref_ce_loss": 0.058857910335063934, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "loss": 0.2215050756931305, + "step": 21280 + }, + { + "ce_loss": 0.026519374921917915, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "distill_loss": 0.16466079652309418, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "ref_ce_loss": 0.02995748445391655, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "loss": 0.3635097146034241, + "step": 21280 + }, + { + "ce_loss": 0.0383116714656353, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "distill_loss": 0.14281782507896423, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "ref_ce_loss": 0.05979880690574646, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "loss": 0.6661757230758667, + "step": 21280 + }, + { + "ce_loss": 0.008801848627626896, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "distill_loss": 0.32869842648506165, + "epoch": 7.098065376917945, + "step": 21280 + }, + { + "epoch": 7.098065376917945, + "ref_ce_loss": 0.048422083258628845, + "step": 21280 + }, + { + "epoch": 7.10140093395597, + "loss": 0.3354, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "grad_norm": 3.7059237957000732, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "learning_rate": 9.817302850896092e-06, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "loss": 0.7997819185256958, + "step": 21290 + }, + { + "ce_loss": 0.057272523641586304, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "distill_loss": 0.3218955993652344, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "ref_ce_loss": 0.06255380809307098, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "loss": 0.7139435410499573, + "step": 21290 + }, + { + "ce_loss": 0.084070585668087, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "distill_loss": 0.23481443524360657, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "ref_ce_loss": 0.07495547086000443, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "loss": 0.35811299085617065, + "step": 21290 + }, + { + "ce_loss": 0.031879205256700516, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "distill_loss": 0.13502013683319092, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "ref_ce_loss": 0.052764344960451126, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "loss": 0.20150531828403473, + "step": 21290 + }, + { + "ce_loss": 0.00755065493285656, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "distill_loss": 0.151234433054924, + "epoch": 7.10140093395597, + "step": 21290 + }, + { + "epoch": 7.10140093395597, + "ref_ce_loss": 0.04256344586610794, + "step": 21290 + }, + { + "epoch": 7.104736490993996, + "loss": 0.3325, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "grad_norm": 3.5701239109039307, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "learning_rate": 9.74535358684323e-06, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "loss": 0.24100618064403534, + "step": 21300 + }, + { + "ce_loss": 0.026463542133569717, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "distill_loss": 0.16529029607772827, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "ref_ce_loss": 0.04909892380237579, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "loss": 0.2727445960044861, + "step": 21300 + }, + { + "ce_loss": 0.022189678624272346, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "distill_loss": 0.18847636878490448, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "ref_ce_loss": 0.04147105664014816, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "loss": 0.2807205021381378, + "step": 21300 + }, + { + "ce_loss": 0.0023678650613874197, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "distill_loss": 0.19518640637397766, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "ref_ce_loss": 0.06072551757097244, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "loss": 0.26328280568122864, + "step": 21300 + }, + { + "ce_loss": 0.03507067635655403, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "distill_loss": 0.15676164627075195, + "epoch": 7.104736490993996, + "step": 21300 + }, + { + "epoch": 7.104736490993996, + "ref_ce_loss": 0.041791219264268875, + "step": 21300 + }, + { + "epoch": 7.108072048032021, + "loss": 0.3332, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "grad_norm": 3.480285167694092, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "learning_rate": 9.673660091939512e-06, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "loss": 0.2976704239845276, + "step": 21310 + }, + { + "ce_loss": 0.03696581348776817, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "distill_loss": 0.20639725029468536, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "ref_ce_loss": 0.038472164422273636, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "loss": 0.4582396149635315, + "step": 21310 + }, + { + "ce_loss": 0.032093822956085205, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "distill_loss": 0.20050117373466492, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "ref_ce_loss": 0.05842212215065956, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "loss": 0.24652057886123657, + "step": 21310 + }, + { + "ce_loss": 0.032380156219005585, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "distill_loss": 0.16446007788181305, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "ref_ce_loss": 0.04951951652765274, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "loss": 0.4571993947029114, + "step": 21310 + }, + { + "ce_loss": 0.00208325800485909, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "distill_loss": 0.2922826409339905, + "epoch": 7.108072048032021, + "step": 21310 + }, + { + "epoch": 7.108072048032021, + "ref_ce_loss": 0.044417742639780045, + "step": 21310 + }, + { + "epoch": 7.111407605070046, + "loss": 0.3539, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "grad_norm": 3.551114082336426, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "learning_rate": 9.602222496925537e-06, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "loss": 0.24439047276973724, + "step": 21320 + }, + { + "ce_loss": 0.010866631753742695, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "distill_loss": 0.17475149035453796, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "ref_ce_loss": 0.029197681695222855, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "loss": 0.17844556272029877, + "step": 21320 + }, + { + "ce_loss": 0.0042810384184122086, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "distill_loss": 0.14426660537719727, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "ref_ce_loss": 0.029594624415040016, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "loss": 0.2958901524543762, + "step": 21320 + }, + { + "ce_loss": 0.011121377348899841, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "distill_loss": 0.16819752752780914, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "ref_ce_loss": 0.041761115193367004, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "loss": 0.3758298456668854, + "step": 21320 + }, + { + "ce_loss": 0.014851349405944347, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "distill_loss": 0.26843661069869995, + "epoch": 7.111407605070046, + "step": 21320 + }, + { + "epoch": 7.111407605070046, + "ref_ce_loss": 0.06300424784421921, + "step": 21320 + }, + { + "epoch": 7.114743162108072, + "loss": 0.3297, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "grad_norm": 2.231074571609497, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "learning_rate": 9.531040932075352e-06, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "loss": 0.1900641918182373, + "step": 21330 + }, + { + "ce_loss": 0.0005149010685272515, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "distill_loss": 0.11143720895051956, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "ref_ce_loss": 0.029446803033351898, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "loss": 0.3922143876552582, + "step": 21330 + }, + { + "ce_loss": 0.016270199790596962, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "distill_loss": 0.24628299474716187, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "ref_ce_loss": 0.04830366000533104, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "loss": 0.49055543541908264, + "step": 21330 + }, + { + "ce_loss": 0.04130590334534645, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "distill_loss": 0.3618186116218567, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "ref_ce_loss": 0.06838495284318924, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "loss": 0.20764322578907013, + "step": 21330 + }, + { + "ce_loss": 0.005058347247540951, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "distill_loss": 0.14182953536510468, + "epoch": 7.114743162108072, + "step": 21330 + }, + { + "epoch": 7.114743162108072, + "ref_ce_loss": 0.04315049946308136, + "step": 21330 + }, + { + "epoch": 7.118078719146097, + "loss": 0.3255, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "grad_norm": 3.8594162464141846, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "learning_rate": 9.460115527195999e-06, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "loss": 0.7007927894592285, + "step": 21340 + }, + { + "ce_loss": 0.07202707231044769, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "distill_loss": 0.16455012559890747, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "ref_ce_loss": 0.06058318912982941, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "loss": 0.22472164034843445, + "step": 21340 + }, + { + "ce_loss": 0.013528671115636826, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "distill_loss": 0.1718527376651764, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "ref_ce_loss": 0.0391010157763958, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "loss": 0.22745491564273834, + "step": 21340 + }, + { + "ce_loss": 0.02853975258767605, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "distill_loss": 0.1669045090675354, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "ref_ce_loss": 0.03188779205083847, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "loss": 0.30058276653289795, + "step": 21340 + }, + { + "ce_loss": 0.05709119886159897, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "distill_loss": 0.18364207446575165, + "epoch": 7.118078719146097, + "step": 21340 + }, + { + "epoch": 7.118078719146097, + "ref_ce_loss": 0.044599033892154694, + "step": 21340 + }, + { + "epoch": 7.121414276184122, + "loss": 0.2999, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "grad_norm": 2.430203437805176, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "learning_rate": 9.389446411627439e-06, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "loss": 0.4514169991016388, + "step": 21350 + }, + { + "ce_loss": 0.04752679169178009, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "distill_loss": 0.13268008828163147, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "ref_ce_loss": 0.04275263473391533, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "loss": 0.37929394841194153, + "step": 21350 + }, + { + "ce_loss": 0.045660898089408875, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "distill_loss": 0.19815610349178314, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "ref_ce_loss": 0.05834659934043884, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "loss": 0.248044952750206, + "step": 21350 + }, + { + "ce_loss": 0.01318468526005745, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "distill_loss": 0.16702339053153992, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "ref_ce_loss": 0.0383535698056221, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "loss": 0.23754586279392242, + "step": 21350 + }, + { + "ce_loss": 0.015385741367936134, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "distill_loss": 0.16478316485881805, + "epoch": 7.121414276184122, + "step": 21350 + }, + { + "epoch": 7.121414276184122, + "ref_ce_loss": 0.03582388535141945, + "step": 21350 + }, + { + "epoch": 7.124749833222148, + "loss": 0.3619, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "grad_norm": 4.065296173095703, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "learning_rate": 9.319033714242347e-06, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "loss": 0.2339756339788437, + "step": 21360 + }, + { + "ce_loss": 0.014882135204970837, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "distill_loss": 0.16815043985843658, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "ref_ce_loss": 0.03089192323386669, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "loss": 0.4349558353424072, + "step": 21360 + }, + { + "ce_loss": 0.026239456608891487, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "distill_loss": 0.2093418538570404, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "ref_ce_loss": 0.07308489829301834, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "loss": 0.313965380191803, + "step": 21360 + }, + { + "ce_loss": 0.031080765649676323, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "distill_loss": 0.19044944643974304, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "ref_ce_loss": 0.050029199570417404, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "loss": 0.2853374183177948, + "step": 21360 + }, + { + "ce_loss": 0.01889471895992756, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "distill_loss": 0.14720062911510468, + "epoch": 7.124749833222148, + "step": 21360 + }, + { + "epoch": 7.124749833222148, + "ref_ce_loss": 0.044718701392412186, + "step": 21360 + }, + { + "epoch": 7.128085390260173, + "loss": 0.3224, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "grad_norm": 2.7422754764556885, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "learning_rate": 9.248877563445611e-06, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "loss": 0.3476315140724182, + "step": 21370 + }, + { + "ce_loss": 0.06209288164973259, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "distill_loss": 0.2215966433286667, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "ref_ce_loss": 0.04940538853406906, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "loss": 0.30568182468414307, + "step": 21370 + }, + { + "ce_loss": 0.06676947325468063, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "distill_loss": 0.13683325052261353, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "ref_ce_loss": 0.0813133716583252, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "loss": 0.2064388394355774, + "step": 21370 + }, + { + "ce_loss": 0.024937504902482033, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "distill_loss": 0.09930000454187393, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "ref_ce_loss": 0.058826789259910583, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "loss": 0.3290785849094391, + "step": 21370 + }, + { + "ce_loss": 0.02121903747320175, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "distill_loss": 0.2270878106355667, + "epoch": 7.128085390260173, + "step": 21370 + }, + { + "epoch": 7.128085390260173, + "ref_ce_loss": 0.031229302287101746, + "step": 21370 + }, + { + "epoch": 7.131420947298198, + "loss": 0.3363, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "grad_norm": 3.2990729808807373, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "learning_rate": 9.178978087174426e-06, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "loss": 0.27483171224594116, + "step": 21380 + }, + { + "ce_loss": 0.0506613627076149, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "distill_loss": 0.1558806151151657, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "ref_ce_loss": 0.04856330528855324, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "loss": 0.43359971046447754, + "step": 21380 + }, + { + "ce_loss": 0.0775969922542572, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "distill_loss": 0.26211804151535034, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "ref_ce_loss": 0.06571530550718307, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "loss": 0.3924413323402405, + "step": 21380 + }, + { + "ce_loss": 0.00827367790043354, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "distill_loss": 0.18567365407943726, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "ref_ce_loss": 0.03840828314423561, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "loss": 0.3229596018791199, + "step": 21380 + }, + { + "ce_loss": 0.030836794525384903, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "distill_loss": 0.20113810896873474, + "epoch": 7.131420947298198, + "step": 21380 + }, + { + "epoch": 7.131420947298198, + "ref_ce_loss": 0.04698263481259346, + "step": 21380 + }, + { + "epoch": 7.134756504336224, + "loss": 0.3376, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "grad_norm": 4.786271572113037, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "learning_rate": 9.109335412897845e-06, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "loss": 0.5202345252037048, + "step": 21390 + }, + { + "ce_loss": 0.05993020907044411, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "distill_loss": 0.1180260181427002, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "ref_ce_loss": 0.0760614275932312, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "loss": 0.2719026207923889, + "step": 21390 + }, + { + "ce_loss": 0.042188555002212524, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "distill_loss": 0.11038866639137268, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "ref_ce_loss": 0.04938900098204613, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "loss": 0.6805160641670227, + "step": 21390 + }, + { + "ce_loss": 0.027744436636567116, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "distill_loss": 0.17196469008922577, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "ref_ce_loss": 0.05967358127236366, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "loss": 0.33298105001449585, + "step": 21390 + }, + { + "ce_loss": 0.047904592007398605, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "distill_loss": 0.18573933839797974, + "epoch": 7.134756504336224, + "step": 21390 + }, + { + "epoch": 7.134756504336224, + "ref_ce_loss": 0.0671057403087616, + "step": 21390 + }, + { + "epoch": 7.138092061374249, + "loss": 0.3493, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "grad_norm": 5.421943664550781, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "learning_rate": 9.039949667616641e-06, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "loss": 0.48561030626296997, + "step": 21400 + }, + { + "ce_loss": 0.03090047463774681, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "distill_loss": 0.32706791162490845, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "ref_ce_loss": 0.07010676711797714, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "loss": 0.3397560119628906, + "step": 21400 + }, + { + "ce_loss": 0.03334207087755203, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "distill_loss": 0.22810399532318115, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "ref_ce_loss": 0.05727453902363777, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "loss": 0.2319808006286621, + "step": 21400 + }, + { + "ce_loss": 0.024934319779276848, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "distill_loss": 0.13915522396564484, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "ref_ce_loss": 0.03444851189851761, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "loss": 0.3142017424106598, + "step": 21400 + }, + { + "ce_loss": 0.009186441078782082, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "distill_loss": 0.220073401927948, + "epoch": 7.138092061374249, + "step": 21400 + }, + { + "epoch": 7.138092061374249, + "ref_ce_loss": 0.05729452520608902, + "step": 21400 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.3489, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "grad_norm": 2.15388560295105, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "learning_rate": 8.970820977863019e-06, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.1508547067642212, + "step": 21410 + }, + { + "ce_loss": 0.01235622726380825, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "distill_loss": 0.09108994156122208, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "ref_ce_loss": 0.03690817207098007, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.5053752660751343, + "step": 21410 + }, + { + "ce_loss": 0.016127459704875946, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "distill_loss": 0.14929452538490295, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "ref_ce_loss": 0.018189487978816032, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.3429606258869171, + "step": 21410 + }, + { + "ce_loss": 0.017136679962277412, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "distill_loss": 0.2520356774330139, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "ref_ce_loss": 0.047066353261470795, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "loss": 0.3544640839099884, + "step": 21410 + }, + { + "ce_loss": 0.027670860290527344, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "distill_loss": 0.24994097650051117, + "epoch": 7.1414276184122745, + "step": 21410 + }, + { + "epoch": 7.1414276184122745, + "ref_ce_loss": 0.05183815583586693, + "step": 21410 + }, + { + "epoch": 7.1447631754503, + "loss": 0.3671, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "grad_norm": 44.177581787109375, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "learning_rate": 8.901949469700487e-06, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "loss": 0.460884153842926, + "step": 21420 + }, + { + "ce_loss": 0.04858725890517235, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "distill_loss": 0.17198331654071808, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "ref_ce_loss": 0.12297511100769043, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "loss": 0.40572482347488403, + "step": 21420 + }, + { + "ce_loss": 0.029643459245562553, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "distill_loss": 0.30070599913597107, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "ref_ce_loss": 0.05532967299222946, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "loss": 0.3806205689907074, + "step": 21420 + }, + { + "ce_loss": 0.02531914971768856, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "distill_loss": 0.29515233635902405, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "ref_ce_loss": 0.059911541640758514, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "loss": 0.5730224847793579, + "step": 21420 + }, + { + "ce_loss": 0.03719313442707062, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "distill_loss": 0.14535340666770935, + "epoch": 7.1447631754503, + "step": 21420 + }, + { + "epoch": 7.1447631754503, + "ref_ce_loss": 0.05887611210346222, + "step": 21420 + }, + { + "epoch": 7.148098732488325, + "loss": 0.3547, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "grad_norm": 4.740625381469727, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "learning_rate": 8.833335268723462e-06, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "loss": 0.20340901613235474, + "step": 21430 + }, + { + "ce_loss": 0.012906559742987156, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "distill_loss": 0.14369744062423706, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "ref_ce_loss": 0.04662976786494255, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "loss": 0.3858056962490082, + "step": 21430 + }, + { + "ce_loss": 0.06574510037899017, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "distill_loss": 0.2124141901731491, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "ref_ce_loss": 0.0829734280705452, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "loss": 0.3030683398246765, + "step": 21430 + }, + { + "ce_loss": 0.04919762909412384, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "distill_loss": 0.1884440779685974, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "ref_ce_loss": 0.03764420002698898, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "loss": 0.43533486127853394, + "step": 21430 + }, + { + "ce_loss": 0.02066192403435707, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "distill_loss": 0.23798935115337372, + "epoch": 7.148098732488325, + "step": 21430 + }, + { + "epoch": 7.148098732488325, + "ref_ce_loss": 0.058441367000341415, + "step": 21430 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.3316, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "grad_norm": 2.6973822116851807, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "learning_rate": 8.76497850005724e-06, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.3422290086746216, + "step": 21440 + }, + { + "ce_loss": 0.02658429928123951, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "distill_loss": 0.2068696916103363, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "ref_ce_loss": 0.054690875113010406, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.38151055574417114, + "step": 21440 + }, + { + "ce_loss": 0.0338323637843132, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "distill_loss": 0.12502741813659668, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "ref_ce_loss": 0.02690565027296543, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.25190258026123047, + "step": 21440 + }, + { + "ce_loss": 0.009585889987647533, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "distill_loss": 0.12396883219480515, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "ref_ce_loss": 0.032859060913324356, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "loss": 0.37992632389068604, + "step": 21440 + }, + { + "ce_loss": 0.022066691890358925, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "distill_loss": 0.13286367058753967, + "epoch": 7.1514342895263505, + "step": 21440 + }, + { + "epoch": 7.1514342895263505, + "ref_ce_loss": 0.034272920340299606, + "step": 21440 + }, + { + "epoch": 7.154769846564376, + "loss": 0.3336, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "grad_norm": 6.438956260681152, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "learning_rate": 8.69687928835754e-06, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "loss": 0.1311127245426178, + "step": 21450 + }, + { + "ce_loss": 0.0023328508250415325, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "distill_loss": 0.09891074895858765, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "ref_ce_loss": 0.02963155321776867, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "loss": 0.48175233602523804, + "step": 21450 + }, + { + "ce_loss": 0.01688428781926632, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "distill_loss": 0.15957482159137726, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "ref_ce_loss": 0.023770110681653023, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "loss": 0.3389487862586975, + "step": 21450 + }, + { + "ce_loss": 0.04547916725277901, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "distill_loss": 0.2358999252319336, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "ref_ce_loss": 0.03925956413149834, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "loss": 0.37305766344070435, + "step": 21450 + }, + { + "ce_loss": 0.01338366698473692, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "distill_loss": 0.2325180470943451, + "epoch": 7.154769846564376, + "step": 21450 + }, + { + "epoch": 7.154769846564376, + "ref_ce_loss": 0.03668252378702164, + "step": 21450 + }, + { + "epoch": 7.158105403602401, + "loss": 0.3734, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "grad_norm": 3.4364991188049316, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "learning_rate": 8.629037757810486e-06, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "loss": 0.22355008125305176, + "step": 21460 + }, + { + "ce_loss": 0.016299467533826828, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "distill_loss": 0.11342824250459671, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "ref_ce_loss": 0.034232400357723236, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "loss": 0.44346651434898376, + "step": 21460 + }, + { + "ce_loss": 0.035172708332538605, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "distill_loss": 0.33047035336494446, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "ref_ce_loss": 0.06059928610920906, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "loss": 0.22185933589935303, + "step": 21460 + }, + { + "ce_loss": 0.026147043332457542, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "distill_loss": 0.1444932222366333, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "ref_ce_loss": 0.024612026289105415, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "loss": 0.26896658539772034, + "step": 21460 + }, + { + "ce_loss": 0.02288634143769741, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "distill_loss": 0.12373004108667374, + "epoch": 7.158105403602401, + "step": 21460 + }, + { + "epoch": 7.158105403602401, + "ref_ce_loss": 0.06691116094589233, + "step": 21460 + }, + { + "epoch": 7.161440960640427, + "loss": 0.3483, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "grad_norm": 2.7852957248687744, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "learning_rate": 8.561454032132253e-06, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "loss": 0.26754996180534363, + "step": 21470 + }, + { + "ce_loss": 0.03477302938699722, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "distill_loss": 0.1344570368528366, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "ref_ce_loss": 0.04726096987724304, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "loss": 0.2843259274959564, + "step": 21470 + }, + { + "ce_loss": 0.022212691605091095, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "distill_loss": 0.1965642273426056, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "ref_ce_loss": 0.033638931810855865, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "loss": 0.28304195404052734, + "step": 21470 + }, + { + "ce_loss": 0.02403201535344124, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "distill_loss": 0.154997318983078, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "ref_ce_loss": 0.038654860109090805, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "loss": 0.3013923168182373, + "step": 21470 + }, + { + "ce_loss": 0.03783771023154259, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "distill_loss": 0.1492641121149063, + "epoch": 7.161440960640427, + "step": 21470 + }, + { + "epoch": 7.161440960640427, + "ref_ce_loss": 0.07085458189249039, + "step": 21470 + }, + { + "epoch": 7.164776517678452, + "loss": 0.3155, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "grad_norm": 3.168621301651001, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "learning_rate": 8.494128234568936e-06, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "loss": 0.5053913593292236, + "step": 21480 + }, + { + "ce_loss": 0.05008067935705185, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "distill_loss": 0.19111387431621552, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "ref_ce_loss": 0.06695059686899185, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "loss": 0.4717422425746918, + "step": 21480 + }, + { + "ce_loss": 0.15805310010910034, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "distill_loss": 0.16689586639404297, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "ref_ce_loss": 0.05943808704614639, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "loss": 0.4965991973876953, + "step": 21480 + }, + { + "ce_loss": 0.026956552639603615, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "distill_loss": 0.38682329654693604, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "ref_ce_loss": 0.047263722866773605, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "loss": 0.24691656231880188, + "step": 21480 + }, + { + "ce_loss": 0.02406405471265316, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "distill_loss": 0.1757323443889618, + "epoch": 7.164776517678452, + "step": 21480 + }, + { + "epoch": 7.164776517678452, + "ref_ce_loss": 0.04702390730381012, + "step": 21480 + }, + { + "epoch": 7.168112074716477, + "loss": 0.311, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "grad_norm": 3.70613956451416, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "learning_rate": 8.427060487896209e-06, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "loss": 0.6340287923812866, + "step": 21490 + }, + { + "ce_loss": 0.026472387835383415, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "distill_loss": 0.32688456773757935, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "ref_ce_loss": 0.06099152937531471, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "loss": 0.14026260375976562, + "step": 21490 + }, + { + "ce_loss": 0.00750059774145484, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "distill_loss": 0.08525346219539642, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "ref_ce_loss": 0.032193947583436966, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "loss": 0.347702294588089, + "step": 21490 + }, + { + "ce_loss": 0.06765453517436981, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "distill_loss": 0.18379972875118256, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "ref_ce_loss": 0.059257086366415024, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "loss": 0.3984415531158447, + "step": 21490 + }, + { + "ce_loss": 0.06973186135292053, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "distill_loss": 0.27749377489089966, + "epoch": 7.168112074716477, + "step": 21490 + }, + { + "epoch": 7.168112074716477, + "ref_ce_loss": 0.050319571048021317, + "step": 21490 + }, + { + "epoch": 7.171447631754503, + "loss": 0.3219, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "grad_norm": 4.287350654602051, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "learning_rate": 8.36025091441917e-06, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "loss": 0.19232136011123657, + "step": 21500 + }, + { + "ce_loss": 0.005756628233939409, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "distill_loss": 0.11545160412788391, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "ref_ce_loss": 0.032617710530757904, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "loss": 0.3118692636489868, + "step": 21500 + }, + { + "ce_loss": 0.0021937473211437464, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "distill_loss": 0.11081088334321976, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "ref_ce_loss": 0.050306838005781174, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "loss": 0.2555490732192993, + "step": 21500 + }, + { + "ce_loss": 0.027354789897799492, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "distill_loss": 0.20176441967487335, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "ref_ce_loss": 0.026141945272684097, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "loss": 0.3627238869667053, + "step": 21500 + }, + { + "ce_loss": 0.02147168107330799, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "distill_loss": 0.22193299233913422, + "epoch": 7.171447631754503, + "step": 21500 + }, + { + "epoch": 7.171447631754503, + "ref_ce_loss": 0.04372347891330719, + "step": 21500 + }, + { + "epoch": 7.174783188792528, + "loss": 0.3478, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "grad_norm": 4.667346954345703, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "learning_rate": 8.293699635972146e-06, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "loss": 0.2823328971862793, + "step": 21510 + }, + { + "ce_loss": 0.026913125067949295, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "distill_loss": 0.18480005860328674, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "ref_ce_loss": 0.05143596976995468, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "loss": 0.2691704034805298, + "step": 21510 + }, + { + "ce_loss": 0.06230126693844795, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "distill_loss": 0.1483161896467209, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "ref_ce_loss": 0.05851830169558525, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "loss": 0.3138797879219055, + "step": 21510 + }, + { + "ce_loss": 0.04803266003727913, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "distill_loss": 0.21480245888233185, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "ref_ce_loss": 0.05064088851213455, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "loss": 0.4249471426010132, + "step": 21510 + }, + { + "ce_loss": 0.05544541776180267, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "distill_loss": 0.1516028344631195, + "epoch": 7.174783188792528, + "step": 21510 + }, + { + "epoch": 7.174783188792528, + "ref_ce_loss": 0.06550660729408264, + "step": 21510 + }, + { + "epoch": 7.178118745830553, + "loss": 0.327, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "grad_norm": 3.7791588306427, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "learning_rate": 8.227406773918405e-06, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "loss": 0.20280539989471436, + "step": 21520 + }, + { + "ce_loss": 0.006557751446962357, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "distill_loss": 0.11654480546712875, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "ref_ce_loss": 0.055110517889261246, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "loss": 0.3507617712020874, + "step": 21520 + }, + { + "ce_loss": 0.013434783555567265, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "distill_loss": 0.25564488768577576, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "ref_ce_loss": 0.041842278093099594, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "loss": 0.2878246605396271, + "step": 21520 + }, + { + "ce_loss": 0.03153041750192642, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "distill_loss": 0.18612876534461975, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "ref_ce_loss": 0.049200598150491714, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "loss": 0.33751150965690613, + "step": 21520 + }, + { + "ce_loss": 0.04821312800049782, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "distill_loss": 0.149494931101799, + "epoch": 7.178118745830553, + "step": 21520 + }, + { + "epoch": 7.178118745830553, + "ref_ce_loss": 0.06942351907491684, + "step": 21520 + }, + { + "epoch": 7.181454302868579, + "loss": 0.3396, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "grad_norm": 2.776742696762085, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "learning_rate": 8.161372449149994e-06, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "loss": 0.2622234523296356, + "step": 21530 + }, + { + "ce_loss": 0.04477609694004059, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "distill_loss": 0.1622024029493332, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "ref_ce_loss": 0.03245099261403084, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "loss": 0.5293148159980774, + "step": 21530 + }, + { + "ce_loss": 0.036752525717020035, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "distill_loss": 0.2745060920715332, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "ref_ce_loss": 0.0655098631978035, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "loss": 0.3222862482070923, + "step": 21530 + }, + { + "ce_loss": 0.014142888598144054, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "distill_loss": 0.09600169211626053, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "ref_ce_loss": 0.031071700155735016, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "loss": 0.24960437417030334, + "step": 21530 + }, + { + "ce_loss": 0.017925996333360672, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "distill_loss": 0.1352931410074234, + "epoch": 7.181454302868579, + "step": 21530 + }, + { + "epoch": 7.181454302868579, + "ref_ce_loss": 0.07950492948293686, + "step": 21530 + }, + { + "epoch": 7.184789859906604, + "loss": 0.344, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "grad_norm": 2.4601974487304688, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "learning_rate": 8.095596782087487e-06, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "loss": 0.3111080825328827, + "step": 21540 + }, + { + "ce_loss": 0.02254648320376873, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "distill_loss": 0.1943889558315277, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "ref_ce_loss": 0.05962618440389633, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "loss": 0.43967175483703613, + "step": 21540 + }, + { + "ce_loss": 0.04014963284134865, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "distill_loss": 0.1975461095571518, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "ref_ce_loss": 0.052939631044864655, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "loss": 0.20108476281166077, + "step": 21540 + }, + { + "ce_loss": 0.038109976798295975, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "distill_loss": 0.1050659641623497, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "ref_ce_loss": 0.045342423021793365, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "loss": 0.2912309765815735, + "step": 21540 + }, + { + "ce_loss": 0.011822945438325405, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "distill_loss": 0.1958990842103958, + "epoch": 7.184789859906604, + "step": 21540 + }, + { + "epoch": 7.184789859906604, + "ref_ce_loss": 0.030265500769019127, + "step": 21540 + }, + { + "epoch": 7.188125416944629, + "loss": 0.3729, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "grad_norm": 3.593996047973633, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "learning_rate": 8.030079892679702e-06, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "loss": 0.35638487339019775, + "step": 21550 + }, + { + "ce_loss": 0.009184126742184162, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "distill_loss": 0.15575531125068665, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "ref_ce_loss": 0.04846501350402832, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "loss": 0.26497361063957214, + "step": 21550 + }, + { + "ce_loss": 0.016823608428239822, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "distill_loss": 0.17821790277957916, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "ref_ce_loss": 0.05922691524028778, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "loss": 0.35575103759765625, + "step": 21550 + }, + { + "ce_loss": 0.03228254243731499, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "distill_loss": 0.23816150426864624, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "ref_ce_loss": 0.06303157657384872, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "loss": 0.32615143060684204, + "step": 21550 + }, + { + "ce_loss": 0.036773085594177246, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "distill_loss": 0.15746957063674927, + "epoch": 7.188125416944629, + "step": 21550 + }, + { + "epoch": 7.188125416944629, + "ref_ce_loss": 0.038831427693367004, + "step": 21550 + }, + { + "epoch": 7.191460973982655, + "loss": 0.3367, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "grad_norm": 6.094079971313477, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "learning_rate": 7.96482190040365e-06, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "loss": 0.46769973635673523, + "step": 21560 + }, + { + "ce_loss": 0.04756125435233116, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "distill_loss": 0.31982189416885376, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "ref_ce_loss": 0.08072319626808167, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "loss": 0.30835866928100586, + "step": 21560 + }, + { + "ce_loss": 0.048588644713163376, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "distill_loss": 0.15134979784488678, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "ref_ce_loss": 0.0634535551071167, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "loss": 0.41379672288894653, + "step": 21560 + }, + { + "ce_loss": 0.13131098449230194, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "distill_loss": 0.19826488196849823, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "ref_ce_loss": 0.04915918409824371, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "loss": 0.3735307455062866, + "step": 21560 + }, + { + "ce_loss": 0.005329641047865152, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "distill_loss": 0.15917302668094635, + "epoch": 7.191460973982655, + "step": 21560 + }, + { + "epoch": 7.191460973982655, + "ref_ce_loss": 0.05440036952495575, + "step": 21560 + }, + { + "epoch": 7.19479653102068, + "loss": 0.3176, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "grad_norm": 3.168907403945923, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "learning_rate": 7.899822924264104e-06, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "loss": 0.607210099697113, + "step": 21570 + }, + { + "ce_loss": 0.02107110247015953, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "distill_loss": 0.14337393641471863, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "ref_ce_loss": 0.04880295693874359, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "loss": 0.2701234519481659, + "step": 21570 + }, + { + "ce_loss": 0.024585945531725883, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "distill_loss": 0.1722574383020401, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "ref_ce_loss": 0.030505353584885597, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "loss": 0.4172288775444031, + "step": 21570 + }, + { + "ce_loss": 0.03714752942323685, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "distill_loss": 0.2899611294269562, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "ref_ce_loss": 0.06813320517539978, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "loss": 0.4019289016723633, + "step": 21570 + }, + { + "ce_loss": 0.030576540157198906, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "distill_loss": 0.28335803747177124, + "epoch": 7.19479653102068, + "step": 21570 + }, + { + "epoch": 7.19479653102068, + "ref_ce_loss": 0.06374424695968628, + "step": 21570 + }, + { + "epoch": 7.198132088058705, + "loss": 0.3242, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "grad_norm": 2.639319896697998, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "learning_rate": 7.835083082793614e-06, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "loss": 0.4229542016983032, + "step": 21580 + }, + { + "ce_loss": 0.04644731432199478, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "distill_loss": 0.2560223937034607, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "ref_ce_loss": 0.07510516792535782, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "loss": 0.5707595348358154, + "step": 21580 + }, + { + "ce_loss": 0.04119217395782471, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "distill_loss": 0.21253474056720734, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "ref_ce_loss": 0.05649447813630104, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "loss": 0.7057649493217468, + "step": 21580 + }, + { + "ce_loss": 0.027443043887615204, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "distill_loss": 0.19002290070056915, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "ref_ce_loss": 0.06741243600845337, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "loss": 0.18125252425670624, + "step": 21580 + }, + { + "ce_loss": 0.012593379244208336, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "distill_loss": 0.13432380557060242, + "epoch": 7.198132088058705, + "step": 21580 + }, + { + "epoch": 7.198132088058705, + "ref_ce_loss": 0.03388500586152077, + "step": 21580 + }, + { + "epoch": 7.201467645096731, + "loss": 0.3757, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "grad_norm": 2.6682674884796143, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "learning_rate": 7.770602494052124e-06, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "loss": 0.44627857208251953, + "step": 21590 + }, + { + "ce_loss": 0.07321686297655106, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "distill_loss": 0.29737573862075806, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "ref_ce_loss": 0.05712924897670746, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "loss": 0.23906278610229492, + "step": 21590 + }, + { + "ce_loss": 0.0435250923037529, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "distill_loss": 0.15379908680915833, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "ref_ce_loss": 0.04168463125824928, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "loss": 0.23472881317138672, + "step": 21590 + }, + { + "ce_loss": 0.039945337921381, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "distill_loss": 0.14018793404102325, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "ref_ce_loss": 0.05431056767702103, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "loss": 0.3228430449962616, + "step": 21590 + }, + { + "ce_loss": 0.003946481738239527, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "distill_loss": 0.13325072824954987, + "epoch": 7.201467645096731, + "step": 21590 + }, + { + "epoch": 7.201467645096731, + "ref_ce_loss": 0.08239027857780457, + "step": 21590 + }, + { + "epoch": 7.204803202134756, + "loss": 0.3117, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "grad_norm": 2.6161727905273438, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "learning_rate": 7.706381275626745e-06, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "loss": 0.250211626291275, + "step": 21600 + }, + { + "ce_loss": 0.01741625741124153, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "distill_loss": 0.12995892763137817, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "ref_ce_loss": 0.05979376658797264, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "loss": 0.27195289731025696, + "step": 21600 + }, + { + "ce_loss": 0.03143487870693207, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "distill_loss": 0.1863202154636383, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "ref_ce_loss": 0.0541115365922451, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "loss": 0.2668476104736328, + "step": 21600 + }, + { + "ce_loss": 0.01905851997435093, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "distill_loss": 0.1592845916748047, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "ref_ce_loss": 0.05389750003814697, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "loss": 0.28188711404800415, + "step": 21600 + }, + { + "ce_loss": 0.05415372550487518, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "distill_loss": 0.15756256878376007, + "epoch": 7.204803202134756, + "step": 21600 + }, + { + "epoch": 7.204803202134756, + "ref_ce_loss": 0.06997683644294739, + "step": 21600 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.3161, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "grad_norm": 3.0321600437164307, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "learning_rate": 7.642419544631672e-06, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.2609192132949829, + "step": 21610 + }, + { + "ce_loss": 0.025224953889846802, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "distill_loss": 0.13914820551872253, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "ref_ce_loss": 0.06633106619119644, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.25504258275032043, + "step": 21610 + }, + { + "ce_loss": 0.025933396071195602, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "distill_loss": 0.1570015847682953, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "ref_ce_loss": 0.04250410944223404, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.3043152391910553, + "step": 21610 + }, + { + "ce_loss": 0.012817701324820518, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "distill_loss": 0.1668878197669983, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "ref_ce_loss": 0.04772022366523743, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "loss": 0.27410194277763367, + "step": 21610 + }, + { + "ce_loss": 0.004894441459327936, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "distill_loss": 0.14274558424949646, + "epoch": 7.2081387591727815, + "step": 21610 + }, + { + "epoch": 7.2081387591727815, + "ref_ce_loss": 0.04445118084549904, + "step": 21610 + }, + { + "epoch": 7.211474316210807, + "loss": 0.3263, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "grad_norm": 2.9612531661987305, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "learning_rate": 7.578717417707892e-06, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "loss": 0.2413022220134735, + "step": 21620 + }, + { + "ce_loss": 0.019957000389695168, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "distill_loss": 0.15345555543899536, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "ref_ce_loss": 0.04753858596086502, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "loss": 0.36386004090309143, + "step": 21620 + }, + { + "ce_loss": 0.02395096980035305, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "distill_loss": 0.28969570994377136, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "ref_ce_loss": 0.03671019524335861, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "loss": 0.4716953635215759, + "step": 21620 + }, + { + "ce_loss": 0.039064228534698486, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "distill_loss": 0.2576276361942291, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "ref_ce_loss": 0.04177805408835411, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "loss": 0.4198686480522156, + "step": 21620 + }, + { + "ce_loss": 0.05144977569580078, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "distill_loss": 0.2534531354904175, + "epoch": 7.211474316210807, + "step": 21620 + }, + { + "epoch": 7.211474316210807, + "ref_ce_loss": 0.07417726516723633, + "step": 21620 + }, + { + "epoch": 7.214809873248832, + "loss": 0.3098, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "grad_norm": 3.0390126705169678, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "learning_rate": 7.515275011022876e-06, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "loss": 0.2712303400039673, + "step": 21630 + }, + { + "ce_loss": 0.02337028831243515, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "distill_loss": 0.11533728986978531, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "ref_ce_loss": 0.020700674504041672, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "loss": 0.27735042572021484, + "step": 21630 + }, + { + "ce_loss": 0.047773364931344986, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "distill_loss": 0.17673635482788086, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "ref_ce_loss": 0.05261456221342087, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "loss": 0.2114078551530838, + "step": 21630 + }, + { + "ce_loss": 0.032486531883478165, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "distill_loss": 0.13928288221359253, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "ref_ce_loss": 0.02496570721268654, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "loss": 0.21625961363315582, + "step": 21630 + }, + { + "ce_loss": 0.00532973138615489, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "distill_loss": 0.1723240315914154, + "epoch": 7.214809873248832, + "step": 21630 + }, + { + "epoch": 7.214809873248832, + "ref_ce_loss": 0.03841087222099304, + "step": 21630 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.3027, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "grad_norm": 3.580503225326538, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "learning_rate": 7.452092440270646e-06, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.29037898778915405, + "step": 21640 + }, + { + "ce_loss": 0.0321304053068161, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "distill_loss": 0.14601705968379974, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "ref_ce_loss": 0.05759825184941292, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.5272622108459473, + "step": 21640 + }, + { + "ce_loss": 0.1542384922504425, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "distill_loss": 0.16737566888332367, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "ref_ce_loss": 0.07182739675045013, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.26996082067489624, + "step": 21640 + }, + { + "ce_loss": 0.02072783373296261, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "distill_loss": 0.1619803011417389, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "ref_ce_loss": 0.05954483151435852, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "loss": 0.40388885140419006, + "step": 21640 + }, + { + "ce_loss": 0.02044593170285225, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "distill_loss": 0.14890506863594055, + "epoch": 7.2181454302868575, + "step": 21640 + }, + { + "epoch": 7.2181454302868575, + "ref_ce_loss": 0.06344384700059891, + "step": 21640 + }, + { + "epoch": 7.221480987324883, + "loss": 0.3518, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "grad_norm": 3.95499587059021, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "learning_rate": 7.38916982067122e-06, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "loss": 0.30346325039863586, + "step": 21650 + }, + { + "ce_loss": 0.06752192229032516, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "distill_loss": 0.15714608132839203, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "ref_ce_loss": 0.04498041421175003, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "loss": 0.3442494869232178, + "step": 21650 + }, + { + "ce_loss": 0.016071105375885963, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "distill_loss": 0.2575092315673828, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "ref_ce_loss": 0.051779165863990784, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "loss": 0.38894587755203247, + "step": 21650 + }, + { + "ce_loss": 0.01721816696226597, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "distill_loss": 0.24018141627311707, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "ref_ce_loss": 0.05405234917998314, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "loss": 0.32709309458732605, + "step": 21650 + }, + { + "ce_loss": 0.028886908665299416, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "distill_loss": 0.16045019030570984, + "epoch": 7.221480987324883, + "step": 21650 + }, + { + "epoch": 7.221480987324883, + "ref_ce_loss": 0.04315178468823433, + "step": 21650 + }, + { + "epoch": 7.224816544362908, + "loss": 0.3514, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "grad_norm": 4.426061630249023, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "learning_rate": 7.326507266970677e-06, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "loss": 0.26750248670578003, + "step": 21660 + }, + { + "ce_loss": 0.023527834564447403, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "distill_loss": 0.18002349138259888, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "ref_ce_loss": 0.04690827801823616, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "loss": 0.6288806200027466, + "step": 21660 + }, + { + "ce_loss": 0.07400006800889969, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "distill_loss": 0.20145976543426514, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "ref_ce_loss": 0.034665659070014954, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "loss": 0.2678120732307434, + "step": 21660 + }, + { + "ce_loss": 0.038052450865507126, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "distill_loss": 0.16327275335788727, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "ref_ce_loss": 0.04916460067033768, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "loss": 0.2479705512523651, + "step": 21660 + }, + { + "ce_loss": 0.01944034732878208, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "distill_loss": 0.15548597276210785, + "epoch": 7.224816544362908, + "step": 21660 + }, + { + "epoch": 7.224816544362908, + "ref_ce_loss": 0.05601424351334572, + "step": 21660 + }, + { + "epoch": 7.228152101400934, + "loss": 0.3284, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "grad_norm": 3.802701234817505, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "learning_rate": 7.264104893440792e-06, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "loss": 0.4184314012527466, + "step": 21670 + }, + { + "ce_loss": 0.03756963461637497, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "distill_loss": 0.2632547616958618, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "ref_ce_loss": 0.04559790715575218, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "loss": 0.21939358115196228, + "step": 21670 + }, + { + "ce_loss": 0.027253881096839905, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "distill_loss": 0.09987346082925797, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "ref_ce_loss": 0.03330458328127861, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "loss": 0.7001844644546509, + "step": 21670 + }, + { + "ce_loss": 0.03261253982782364, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "distill_loss": 0.12179021537303925, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "ref_ce_loss": 0.03093714825809002, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "loss": 0.4727906286716461, + "step": 21670 + }, + { + "ce_loss": 0.1115996390581131, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "distill_loss": 0.2063658982515335, + "epoch": 7.228152101400934, + "step": 21670 + }, + { + "epoch": 7.228152101400934, + "ref_ce_loss": 0.04917268455028534, + "step": 21670 + }, + { + "epoch": 7.231487658438959, + "loss": 0.3376, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "grad_norm": 2.850815534591675, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "learning_rate": 7.201962813878837e-06, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "loss": 0.322729229927063, + "step": 21680 + }, + { + "ce_loss": 0.013876276090741158, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "distill_loss": 0.14954251050949097, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "ref_ce_loss": 0.057617779821157455, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "loss": 0.22298645973205566, + "step": 21680 + }, + { + "ce_loss": 0.02905522659420967, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "distill_loss": 0.13507865369319916, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "ref_ce_loss": 0.043622229248285294, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "loss": 0.25708311796188354, + "step": 21680 + }, + { + "ce_loss": 0.03127783536911011, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "distill_loss": 0.14482389390468597, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "ref_ce_loss": 0.052324190735816956, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "loss": 0.4473877549171448, + "step": 21680 + }, + { + "ce_loss": 0.034591808915138245, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "distill_loss": 0.16112400591373444, + "epoch": 7.231487658438959, + "step": 21680 + }, + { + "epoch": 7.231487658438959, + "ref_ce_loss": 0.05375932529568672, + "step": 21680 + }, + { + "epoch": 7.234823215476984, + "loss": 0.3232, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "grad_norm": 2.6115386486053467, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "learning_rate": 7.140081141607479e-06, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "loss": 0.2910879850387573, + "step": 21690 + }, + { + "ce_loss": 0.02544204518198967, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "distill_loss": 0.21739305555820465, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "ref_ce_loss": 0.04808683693408966, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "loss": 0.36207371950149536, + "step": 21690 + }, + { + "ce_loss": 0.03599226474761963, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "distill_loss": 0.19642367959022522, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "ref_ce_loss": 0.0867382362484932, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "loss": 0.44683051109313965, + "step": 21690 + }, + { + "ce_loss": 0.10506901890039444, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "distill_loss": 0.2525382936000824, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "ref_ce_loss": 0.06445559859275818, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "loss": 0.28379279375076294, + "step": 21690 + }, + { + "ce_loss": 0.024022918194532394, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "distill_loss": 0.1875816434621811, + "epoch": 7.234823215476984, + "step": 21690 + }, + { + "epoch": 7.234823215476984, + "ref_ce_loss": 0.025708317756652832, + "step": 21690 + }, + { + "epoch": 7.23815877251501, + "loss": 0.3608, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "grad_norm": 3.2080724239349365, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "learning_rate": 7.0784599894745e-06, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "loss": 0.34930747747421265, + "step": 21700 + }, + { + "ce_loss": 0.052470266819000244, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "distill_loss": 0.1867637187242508, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "ref_ce_loss": 0.04565154388546944, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "loss": 0.3086041510105133, + "step": 21700 + }, + { + "ce_loss": 0.03938201069831848, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "distill_loss": 0.20732665061950684, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "ref_ce_loss": 0.05089586228132248, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "loss": 0.4481806755065918, + "step": 21700 + }, + { + "ce_loss": 0.07399517297744751, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "distill_loss": 0.14980250597000122, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "ref_ce_loss": 0.05937043949961662, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "loss": 0.24760335683822632, + "step": 21700 + }, + { + "ce_loss": 0.009519390761852264, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "distill_loss": 0.15549524128437042, + "epoch": 7.23815877251501, + "step": 21700 + }, + { + "epoch": 7.23815877251501, + "ref_ce_loss": 0.056334398686885834, + "step": 21700 + }, + { + "epoch": 7.241494329553035, + "loss": 0.3446, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "grad_norm": 3.619967460632324, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "learning_rate": 7.0170994698525274e-06, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "loss": 0.28680312633514404, + "step": 21710 + }, + { + "ce_loss": 0.05779293179512024, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "distill_loss": 0.1770837903022766, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "ref_ce_loss": 0.03173800930380821, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "loss": 0.17028909921646118, + "step": 21710 + }, + { + "ce_loss": 0.010208682157099247, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "distill_loss": 0.11676836013793945, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "ref_ce_loss": 0.042866289615631104, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "loss": 0.22058066725730896, + "step": 21710 + }, + { + "ce_loss": 0.02043846808373928, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "distill_loss": 0.1391141414642334, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "ref_ce_loss": 0.043113306164741516, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "loss": 0.2991776168346405, + "step": 21710 + }, + { + "ce_loss": 0.019159501418471336, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "distill_loss": 0.20416052639484406, + "epoch": 7.241494329553035, + "step": 21710 + }, + { + "epoch": 7.241494329553035, + "ref_ce_loss": 0.03246960788965225, + "step": 21710 + }, + { + "epoch": 7.24482988659106, + "loss": 0.343, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "grad_norm": 4.5923380851745605, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "learning_rate": 6.955999694639003e-06, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "loss": 0.3403356969356537, + "step": 21720 + }, + { + "ce_loss": 0.0032351722475141287, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "distill_loss": 0.16471640765666962, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "ref_ce_loss": 0.059697773307561874, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "loss": 0.24258598685264587, + "step": 21720 + }, + { + "ce_loss": 0.0037439356092363596, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "distill_loss": 0.1754499077796936, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "ref_ce_loss": 0.030157914385199547, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "loss": 0.1979200690984726, + "step": 21720 + }, + { + "ce_loss": 0.0022134960163384676, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "distill_loss": 0.1415894627571106, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "ref_ce_loss": 0.023886706680059433, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "loss": 0.5604916214942932, + "step": 21720 + }, + { + "ce_loss": 0.03827451169490814, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "distill_loss": 0.22929227352142334, + "epoch": 7.24482988659106, + "step": 21720 + }, + { + "epoch": 7.24482988659106, + "ref_ce_loss": 0.07590640336275101, + "step": 21720 + }, + { + "epoch": 7.248165443629086, + "loss": 0.3597, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "grad_norm": 3.0626065731048584, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "learning_rate": 6.895160775255764e-06, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "loss": 0.16732649505138397, + "step": 21730 + }, + { + "ce_loss": 0.013631356880068779, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "distill_loss": 0.12075482308864594, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "ref_ce_loss": 0.022803358733654022, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "loss": 0.24672572314739227, + "step": 21730 + }, + { + "ce_loss": 0.010074594989418983, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "distill_loss": 0.16110017895698547, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "ref_ce_loss": 0.0509311780333519, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "loss": 0.8778437376022339, + "step": 21730 + }, + { + "ce_loss": 0.0628712847828865, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "distill_loss": 0.38136518001556396, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "ref_ce_loss": 0.06170080602169037, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "loss": 0.8466122150421143, + "step": 21730 + }, + { + "ce_loss": 0.08081597089767456, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "distill_loss": 0.15677189826965332, + "epoch": 7.248165443629086, + "step": 21730 + }, + { + "epoch": 7.248165443629086, + "ref_ce_loss": 0.05595485121011734, + "step": 21730 + }, + { + "epoch": 7.251501000667111, + "loss": 0.359, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "grad_norm": 3.6847281455993652, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "learning_rate": 6.834582822649015e-06, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "loss": 0.27559083700180054, + "step": 21740 + }, + { + "ce_loss": 0.03012896329164505, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "distill_loss": 0.13183234632015228, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "ref_ce_loss": 0.045790113508701324, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "loss": 0.4375584125518799, + "step": 21740 + }, + { + "ce_loss": 0.0076003712601959705, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "distill_loss": 0.1997174620628357, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "ref_ce_loss": 0.07456426322460175, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "loss": 0.2726700007915497, + "step": 21740 + }, + { + "ce_loss": 0.03593643754720688, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "distill_loss": 0.11179260164499283, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "ref_ce_loss": 0.04381109029054642, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "loss": 0.42131561040878296, + "step": 21740 + }, + { + "ce_loss": 0.06356733292341232, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "distill_loss": 0.1883009672164917, + "epoch": 7.251501000667111, + "step": 21740 + }, + { + "epoch": 7.251501000667111, + "ref_ce_loss": 0.06666535139083862, + "step": 21740 + }, + { + "epoch": 7.254836557705136, + "loss": 0.3445, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "grad_norm": 3.5137743949890137, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "learning_rate": 6.774265947289053e-06, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "loss": 0.4460420310497284, + "step": 21750 + }, + { + "ce_loss": 0.07431213557720184, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "distill_loss": 0.26951563358306885, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "ref_ce_loss": 0.05519580841064453, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "loss": 0.23930570483207703, + "step": 21750 + }, + { + "ce_loss": 0.02638310380280018, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "distill_loss": 0.11300281435251236, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "ref_ce_loss": 0.051546212285757065, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "loss": 0.41818490624427795, + "step": 21750 + }, + { + "ce_loss": 0.019194016233086586, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "distill_loss": 0.2620835602283478, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "ref_ce_loss": 0.05575841665267944, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "loss": 0.2405170053243637, + "step": 21750 + }, + { + "ce_loss": 0.01714288257062435, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "distill_loss": 0.1712266355752945, + "epoch": 7.254836557705136, + "step": 21750 + }, + { + "epoch": 7.254836557705136, + "ref_ce_loss": 0.05204594135284424, + "step": 21750 + }, + { + "epoch": 7.258172114743162, + "loss": 0.3298, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "grad_norm": 2.681328058242798, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "learning_rate": 6.7142102591700606e-06, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "loss": 0.541475772857666, + "step": 21760 + }, + { + "ce_loss": 0.018261613324284554, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "distill_loss": 0.3466701805591583, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "ref_ce_loss": 0.05522928014397621, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "loss": 0.33088329434394836, + "step": 21760 + }, + { + "ce_loss": 0.018995869904756546, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "distill_loss": 0.14749498665332794, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "ref_ce_loss": 0.03270300105214119, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "loss": 0.3270573318004608, + "step": 21760 + }, + { + "ce_loss": 0.050677765160799026, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "distill_loss": 0.19777001440525055, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "ref_ce_loss": 0.0531977117061615, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "loss": 0.35968199372291565, + "step": 21760 + }, + { + "ce_loss": 0.0543878972530365, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "distill_loss": 0.1706765741109848, + "epoch": 7.258172114743162, + "step": 21760 + }, + { + "epoch": 7.258172114743162, + "ref_ce_loss": 0.050895582884550095, + "step": 21760 + }, + { + "epoch": 7.261507671781187, + "loss": 0.3198, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "grad_norm": 3.674339532852173, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "learning_rate": 6.6544158678099476e-06, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "loss": 0.3187209367752075, + "step": 21770 + }, + { + "ce_loss": 0.005656755529344082, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "distill_loss": 0.20920640230178833, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "ref_ce_loss": 0.05724534019827843, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "loss": 0.6237984895706177, + "step": 21770 + }, + { + "ce_loss": 0.007133541628718376, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "distill_loss": 0.20287460088729858, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "ref_ce_loss": 0.05001780763268471, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "loss": 0.38897567987442017, + "step": 21770 + }, + { + "ce_loss": 0.0849277600646019, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "distill_loss": 0.21830356121063232, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "ref_ce_loss": 0.05569655820727348, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "loss": 0.32482588291168213, + "step": 21770 + }, + { + "ce_loss": 0.012219560332596302, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "distill_loss": 0.19683362543582916, + "epoch": 7.261507671781187, + "step": 21770 + }, + { + "epoch": 7.261507671781187, + "ref_ce_loss": 0.04783269762992859, + "step": 21770 + }, + { + "epoch": 7.264843228819212, + "loss": 0.3182, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "grad_norm": 4.115479946136475, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "learning_rate": 6.594882882250041e-06, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "loss": 0.2964266538619995, + "step": 21780 + }, + { + "ce_loss": 0.03865527734160423, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "distill_loss": 0.15016742050647736, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "ref_ce_loss": 0.03687658905982971, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "loss": 0.3021934926509857, + "step": 21780 + }, + { + "ce_loss": 0.022084327414631844, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "distill_loss": 0.20831188559532166, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "ref_ce_loss": 0.07171301543712616, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "loss": 0.331741601228714, + "step": 21780 + }, + { + "ce_loss": 0.055888574570417404, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "distill_loss": 0.22407200932502747, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "ref_ce_loss": 0.051574043929576874, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "loss": 0.24614335596561432, + "step": 21780 + }, + { + "ce_loss": 0.028283387422561646, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "distill_loss": 0.1770130693912506, + "epoch": 7.264843228819212, + "step": 21780 + }, + { + "epoch": 7.264843228819212, + "ref_ce_loss": 0.04076956957578659, + "step": 21780 + }, + { + "epoch": 7.268178785857238, + "loss": 0.3231, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "grad_norm": 3.716766595840454, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "learning_rate": 6.535611411055064e-06, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "loss": 0.4391646981239319, + "step": 21790 + }, + { + "ce_loss": 0.0490754172205925, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "distill_loss": 0.27585989236831665, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "ref_ce_loss": 0.07593761384487152, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "loss": 0.3659456968307495, + "step": 21790 + }, + { + "ce_loss": 0.03586322441697121, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "distill_loss": 0.2670363783836365, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "ref_ce_loss": 0.06285092234611511, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "loss": 0.34236231446266174, + "step": 21790 + }, + { + "ce_loss": 0.06531081348657608, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "distill_loss": 0.1928083449602127, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "ref_ce_loss": 0.04548466578125954, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "loss": 0.3572752773761749, + "step": 21790 + }, + { + "ce_loss": 0.04397887364029884, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "distill_loss": 0.16652613878250122, + "epoch": 7.268178785857238, + "step": 21790 + }, + { + "epoch": 7.268178785857238, + "ref_ce_loss": 0.05783737450838089, + "step": 21790 + }, + { + "epoch": 7.271514342895263, + "loss": 0.3356, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "grad_norm": 3.6929001808166504, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "learning_rate": 6.476601562312788e-06, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "loss": 0.41686001420021057, + "step": 21800 + }, + { + "ce_loss": 0.10789791494607925, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "distill_loss": 0.2107146978378296, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "ref_ce_loss": 0.052300117909908295, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "loss": 0.3694479465484619, + "step": 21800 + }, + { + "ce_loss": 0.03740653023123741, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "distill_loss": 0.16590005159378052, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "ref_ce_loss": 0.07728464156389236, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "loss": 0.2800734043121338, + "step": 21800 + }, + { + "ce_loss": 0.04555989429354668, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "distill_loss": 0.1753097027540207, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "ref_ce_loss": 0.04043455794453621, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "loss": 0.2379513531923294, + "step": 21800 + }, + { + "ce_loss": 0.01604171097278595, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "distill_loss": 0.13487288355827332, + "epoch": 7.271514342895263, + "step": 21800 + }, + { + "epoch": 7.271514342895263, + "ref_ce_loss": 0.04081754758954048, + "step": 21800 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.3518, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "grad_norm": 2.9927523136138916, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "learning_rate": 6.417853443633902e-06, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.2745986580848694, + "step": 21810 + }, + { + "ce_loss": 0.09028304368257523, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "distill_loss": 0.14296692609786987, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "ref_ce_loss": 0.03005700558423996, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.4135947525501251, + "step": 21810 + }, + { + "ce_loss": 0.023570353165268898, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "distill_loss": 0.33251529932022095, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "ref_ce_loss": 0.05725204572081566, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.21228575706481934, + "step": 21810 + }, + { + "ce_loss": 0.01579027622938156, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "distill_loss": 0.14295357465744019, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "ref_ce_loss": 0.05342310294508934, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "loss": 0.28802329301834106, + "step": 21810 + }, + { + "ce_loss": 0.02745390497148037, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "distill_loss": 0.1880541741847992, + "epoch": 7.2748498999332885, + "step": 21810 + }, + { + "epoch": 7.2748498999332885, + "ref_ce_loss": 0.02855663187801838, + "step": 21810 + }, + { + "epoch": 7.278185456971314, + "loss": 0.3351, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "grad_norm": 3.009716510772705, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "learning_rate": 6.359367162151824e-06, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "loss": 0.3255745768547058, + "step": 21820 + }, + { + "ce_loss": 0.07791402190923691, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "distill_loss": 0.17362235486507416, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "ref_ce_loss": 0.07391360402107239, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "loss": 0.3911256492137909, + "step": 21820 + }, + { + "ce_loss": 0.027354300022125244, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "distill_loss": 0.3045755922794342, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "ref_ce_loss": 0.03816691413521767, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "loss": 0.5183221101760864, + "step": 21820 + }, + { + "ce_loss": 0.03627387061715126, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "distill_loss": 0.17436179518699646, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "ref_ce_loss": 0.06194957718253136, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "loss": 0.27898895740509033, + "step": 21820 + }, + { + "ce_loss": 0.0026024733670055866, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "distill_loss": 0.22231397032737732, + "epoch": 7.278185456971314, + "step": 21820 + }, + { + "epoch": 7.278185456971314, + "ref_ce_loss": 0.035759177058935165, + "step": 21820 + }, + { + "epoch": 7.281521014009339, + "loss": 0.3531, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "grad_norm": 3.047001838684082, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "learning_rate": 6.30114282452242e-06, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "loss": 0.4379478096961975, + "step": 21830 + }, + { + "ce_loss": 0.08663278818130493, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "distill_loss": 0.305935800075531, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "ref_ce_loss": 0.04534162953495979, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "loss": 0.4171701669692993, + "step": 21830 + }, + { + "ce_loss": 0.01691397652029991, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "distill_loss": 0.18893980979919434, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "ref_ce_loss": 0.04799644649028778, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "loss": 0.38544657826423645, + "step": 21830 + }, + { + "ce_loss": 0.023883184418082237, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "distill_loss": 0.256287544965744, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "ref_ce_loss": 0.046406473964452744, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "loss": 0.336747407913208, + "step": 21830 + }, + { + "ce_loss": 0.0368281789124012, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "distill_loss": 0.22192564606666565, + "epoch": 7.281521014009339, + "step": 21830 + }, + { + "epoch": 7.281521014009339, + "ref_ce_loss": 0.05088139325380325, + "step": 21830 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.3154, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "grad_norm": 2.9419236183166504, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "learning_rate": 6.243180536923925e-06, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.21947017312049866, + "step": 21840 + }, + { + "ce_loss": 0.045658010989427567, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "distill_loss": 0.1260603815317154, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "ref_ce_loss": 0.0476338192820549, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.2900714576244354, + "step": 21840 + }, + { + "ce_loss": 0.012831090949475765, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "distill_loss": 0.21750670671463013, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "ref_ce_loss": 0.05956600233912468, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.3997568190097809, + "step": 21840 + }, + { + "ce_loss": 0.0886455699801445, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "distill_loss": 0.20969116687774658, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "ref_ce_loss": 0.07083108276128769, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "loss": 0.4463040232658386, + "step": 21840 + }, + { + "ce_loss": 0.08025004714727402, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "distill_loss": 0.2879204750061035, + "epoch": 7.2848565710473645, + "step": 21840 + }, + { + "epoch": 7.2848565710473645, + "ref_ce_loss": 0.06017610430717468, + "step": 21840 + }, + { + "epoch": 7.28819212808539, + "loss": 0.3874, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "grad_norm": 3.621523380279541, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "learning_rate": 6.185480405056686e-06, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "loss": 0.3507860004901886, + "step": 21850 + }, + { + "ce_loss": 0.007834719493985176, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "distill_loss": 0.12249279022216797, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "ref_ce_loss": 0.06363532692193985, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "loss": 0.39529716968536377, + "step": 21850 + }, + { + "ce_loss": 0.03328666463494301, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "distill_loss": 0.1468905508518219, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "ref_ce_loss": 0.0494842529296875, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "loss": 0.28177881240844727, + "step": 21850 + }, + { + "ce_loss": 0.07029473036527634, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "distill_loss": 0.16273248195648193, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "ref_ce_loss": 0.04860760644078255, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "loss": 0.16786929965019226, + "step": 21850 + }, + { + "ce_loss": 0.0027700222562998533, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "distill_loss": 0.12942123413085938, + "epoch": 7.28819212808539, + "step": 21850 + }, + { + "epoch": 7.28819212808539, + "ref_ce_loss": 0.035599589347839355, + "step": 21850 + }, + { + "epoch": 7.291527685123415, + "loss": 0.3397, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "grad_norm": 3.5767362117767334, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "learning_rate": 6.128042534143002e-06, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "loss": 0.3591082692146301, + "step": 21860 + }, + { + "ce_loss": 0.012450255453586578, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "distill_loss": 0.20948486030101776, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "ref_ce_loss": 0.042204998433589935, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "loss": 0.3279259204864502, + "step": 21860 + }, + { + "ce_loss": 0.04447029158473015, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "distill_loss": 0.21123427152633667, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "ref_ce_loss": 0.05294876918196678, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "loss": 0.3567955195903778, + "step": 21860 + }, + { + "ce_loss": 0.08325055241584778, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "distill_loss": 0.16381686925888062, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "ref_ce_loss": 0.07084712386131287, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "loss": 0.22216692566871643, + "step": 21860 + }, + { + "ce_loss": 0.03289766609668732, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "distill_loss": 0.15384438633918762, + "epoch": 7.291527685123415, + "step": 21860 + }, + { + "epoch": 7.291527685123415, + "ref_ce_loss": 0.02933252416551113, + "step": 21860 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.3305, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "grad_norm": 5.346068859100342, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "learning_rate": 6.070867028926868e-06, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.42858025431632996, + "step": 21870 + }, + { + "ce_loss": 0.023396633565425873, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "distill_loss": 0.30239880084991455, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "ref_ce_loss": 0.07331034541130066, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.3239220678806305, + "step": 21870 + }, + { + "ce_loss": 0.009180337190628052, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "distill_loss": 0.13598200678825378, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "ref_ce_loss": 0.05417673662304878, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.21557603776454926, + "step": 21870 + }, + { + "ce_loss": 0.012299394235014915, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "distill_loss": 0.12474885582923889, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "ref_ce_loss": 0.05441064387559891, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "loss": 0.32964056730270386, + "step": 21870 + }, + { + "ce_loss": 0.05307295173406601, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "distill_loss": 0.17791429162025452, + "epoch": 7.2948632421614406, + "step": 21870 + }, + { + "epoch": 7.2948632421614406, + "ref_ce_loss": 0.042303744703531265, + "step": 21870 + }, + { + "epoch": 7.298198799199466, + "loss": 0.327, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "grad_norm": 2.8577730655670166, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "learning_rate": 6.0139539936738975e-06, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "loss": 0.3614603877067566, + "step": 21880 + }, + { + "ce_loss": 0.1079586073756218, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "distill_loss": 0.1908852756023407, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "ref_ce_loss": 0.04335471987724304, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "loss": 0.2224392294883728, + "step": 21880 + }, + { + "ce_loss": 0.04111739993095398, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "distill_loss": 0.12654203176498413, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "ref_ce_loss": 0.0440577007830143, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "loss": 0.24377888441085815, + "step": 21880 + }, + { + "ce_loss": 0.02872404456138611, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "distill_loss": 0.11825037002563477, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "ref_ce_loss": 0.03949522599577904, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "loss": 0.6034325957298279, + "step": 21880 + }, + { + "ce_loss": 0.023845089599490166, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "distill_loss": 0.22611646354198456, + "epoch": 7.298198799199466, + "step": 21880 + }, + { + "epoch": 7.298198799199466, + "ref_ce_loss": 0.04899800941348076, + "step": 21880 + }, + { + "epoch": 7.301534356237491, + "loss": 0.3174, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "grad_norm": 2.451704740524292, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "learning_rate": 5.9573035321709535e-06, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "loss": 0.254489928483963, + "step": 21890 + }, + { + "ce_loss": 0.017234718427062035, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "distill_loss": 0.18709203600883484, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "ref_ce_loss": 0.03838713467121124, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "loss": 0.5210539102554321, + "step": 21890 + }, + { + "ce_loss": 0.010104808956384659, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "distill_loss": 0.1786583811044693, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "ref_ce_loss": 0.08645079284906387, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "loss": 0.5077769756317139, + "step": 21890 + }, + { + "ce_loss": 0.04721241444349289, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "distill_loss": 0.12380172312259674, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "ref_ce_loss": 0.06811454147100449, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "loss": 0.4189784526824951, + "step": 21890 + }, + { + "ce_loss": 0.024810247123241425, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "distill_loss": 0.3204716444015503, + "epoch": 7.301534356237491, + "step": 21890 + }, + { + "epoch": 7.301534356237491, + "ref_ce_loss": 0.04616758972406387, + "step": 21890 + }, + { + "epoch": 7.304869913275517, + "loss": 0.3187, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "grad_norm": 2.8359055519104004, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "learning_rate": 5.900915747726182e-06, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "loss": 0.2653408348560333, + "step": 21900 + }, + { + "ce_loss": 0.034685708582401276, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "distill_loss": 0.1382124423980713, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "ref_ce_loss": 0.05031518265604973, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "loss": 0.3756208121776581, + "step": 21900 + }, + { + "ce_loss": 0.062474705278873444, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "distill_loss": 0.1996249258518219, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "ref_ce_loss": 0.06248785927891731, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "loss": 0.3942401111125946, + "step": 21900 + }, + { + "ce_loss": 0.017181461676955223, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "distill_loss": 0.26439082622528076, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "ref_ce_loss": 0.0879964753985405, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "loss": 0.29367902874946594, + "step": 21900 + }, + { + "ce_loss": 0.021005744114518166, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "distill_loss": 0.1865503340959549, + "epoch": 7.304869913275517, + "step": 21900 + }, + { + "epoch": 7.304869913275517, + "ref_ce_loss": 0.06075814738869667, + "step": 21900 + }, + { + "epoch": 7.308205470313542, + "loss": 0.3228, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "grad_norm": 5.696154594421387, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "learning_rate": 5.844790743168593e-06, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "loss": 0.5747356414794922, + "step": 21910 + }, + { + "ce_loss": 0.04167019948363304, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "distill_loss": 0.376578688621521, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "ref_ce_loss": 0.061236023902893066, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "loss": 0.2941112816333771, + "step": 21910 + }, + { + "ce_loss": 0.03991208225488663, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "distill_loss": 0.17364008724689484, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "ref_ce_loss": 0.04996495693922043, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "loss": 0.5141149759292603, + "step": 21910 + }, + { + "ce_loss": 0.021073229610919952, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "distill_loss": 0.308474600315094, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "ref_ce_loss": 0.03844856843352318, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "loss": 0.3157500922679901, + "step": 21910 + }, + { + "ce_loss": 0.0359286367893219, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "distill_loss": 0.1874678134918213, + "epoch": 7.308205470313542, + "step": 21910 + }, + { + "epoch": 7.308205470313542, + "ref_ce_loss": 0.04697670415043831, + "step": 21910 + }, + { + "epoch": 7.311541027351567, + "loss": 0.3232, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "grad_norm": 3.316659688949585, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "learning_rate": 5.788928620848115e-06, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "loss": 0.3719853460788727, + "step": 21920 + }, + { + "ce_loss": 0.046290088444948196, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "distill_loss": 0.19583165645599365, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "ref_ce_loss": 0.06114131957292557, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "loss": 0.3432145118713379, + "step": 21920 + }, + { + "ce_loss": 0.02922457829117775, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "distill_loss": 0.2272334098815918, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "ref_ce_loss": 0.0575200691819191, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "loss": 0.3053845763206482, + "step": 21920 + }, + { + "ce_loss": 0.010052576661109924, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "distill_loss": 0.15204855799674988, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "ref_ce_loss": 0.043563369661569595, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "loss": 0.3522624671459198, + "step": 21920 + }, + { + "ce_loss": 0.04388388246297836, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "distill_loss": 0.2134374976158142, + "epoch": 7.311541027351567, + "step": 21920 + }, + { + "epoch": 7.311541027351567, + "ref_ce_loss": 0.06627210229635239, + "step": 21920 + }, + { + "epoch": 7.314876584389593, + "loss": 0.3748, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "grad_norm": 3.1502254009246826, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "learning_rate": 5.73332948263523e-06, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "loss": 0.6507969498634338, + "step": 21930 + }, + { + "ce_loss": 0.005436472594738007, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "distill_loss": 0.13547836244106293, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "ref_ce_loss": 0.03680480644106865, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "loss": 0.26195141673088074, + "step": 21930 + }, + { + "ce_loss": 0.026195021346211433, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "distill_loss": 0.1941198855638504, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "ref_ce_loss": 0.0410786047577858, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "loss": 0.27653250098228455, + "step": 21930 + }, + { + "ce_loss": 0.0419037900865078, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "distill_loss": 0.18454256653785706, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "ref_ce_loss": 0.027803665027022362, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "loss": 0.4033946096897125, + "step": 21930 + }, + { + "ce_loss": 0.0186313409358263, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "distill_loss": 0.2735311686992645, + "epoch": 7.314876584389593, + "step": 21930 + }, + { + "epoch": 7.314876584389593, + "ref_ce_loss": 0.05757341533899307, + "step": 21930 + }, + { + "epoch": 7.318212141427618, + "loss": 0.3289, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "grad_norm": 3.537044048309326, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "learning_rate": 5.677993429920796e-06, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "loss": 0.28274813294410706, + "step": 21940 + }, + { + "ce_loss": 0.025311071425676346, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "distill_loss": 0.20105645060539246, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "ref_ce_loss": 0.05622454360127449, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "loss": 0.3809490501880646, + "step": 21940 + }, + { + "ce_loss": 0.012795322574675083, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "distill_loss": 0.27297529578208923, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "ref_ce_loss": 0.06003205478191376, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "loss": 0.16590841114521027, + "step": 21940 + }, + { + "ce_loss": 0.026391826570034027, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "distill_loss": 0.0870550125837326, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "ref_ce_loss": 0.033449750393629074, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "loss": 0.36872372031211853, + "step": 21940 + }, + { + "ce_loss": 0.02109898068010807, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "distill_loss": 0.25131669640541077, + "epoch": 7.318212141427618, + "step": 21940 + }, + { + "epoch": 7.318212141427618, + "ref_ce_loss": 0.03654967620968819, + "step": 21940 + }, + { + "epoch": 7.321547698465643, + "loss": 0.3165, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "grad_norm": 2.954684257507324, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "learning_rate": 5.6229205636159794e-06, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "loss": 0.44153302907943726, + "step": 21950 + }, + { + "ce_loss": 0.015158475376665592, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "distill_loss": 0.1195288747549057, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "ref_ce_loss": 0.0664001852273941, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "loss": 0.22423836588859558, + "step": 21950 + }, + { + "ce_loss": 0.011930732056498528, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "distill_loss": 0.17623679339885712, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "ref_ce_loss": 0.0360279381275177, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "loss": 0.2697838544845581, + "step": 21950 + }, + { + "ce_loss": 0.010306882672011852, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "distill_loss": 0.19858577847480774, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "ref_ce_loss": 0.047593116760253906, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "loss": 0.2993145287036896, + "step": 21950 + }, + { + "ce_loss": 0.055308952927589417, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "distill_loss": 0.15778449177742004, + "epoch": 7.321547698465643, + "step": 21950 + }, + { + "epoch": 7.321547698465643, + "ref_ce_loss": 0.06056550145149231, + "step": 21950 + }, + { + "epoch": 7.324883255503669, + "loss": 0.3201, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "grad_norm": 2.9763293266296387, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "learning_rate": 5.568110984151925e-06, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "loss": 0.40113523602485657, + "step": 21960 + }, + { + "ce_loss": 0.04013872146606445, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "distill_loss": 0.23629052937030792, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "ref_ce_loss": 0.055267203599214554, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "loss": 0.2200232297182083, + "step": 21960 + }, + { + "ce_loss": 0.006899657659232616, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "distill_loss": 0.13835325837135315, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "ref_ce_loss": 0.03750453144311905, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "loss": 0.34387919306755066, + "step": 21960 + }, + { + "ce_loss": 0.05290165916085243, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "distill_loss": 0.1321592777967453, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "ref_ce_loss": 0.07124420255422592, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "loss": 0.364957332611084, + "step": 21960 + }, + { + "ce_loss": 0.03829869255423546, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "distill_loss": 0.15634045004844666, + "epoch": 7.324883255503669, + "step": 21960 + }, + { + "epoch": 7.324883255503669, + "ref_ce_loss": 0.044923074543476105, + "step": 21960 + }, + { + "epoch": 7.328218812541694, + "loss": 0.3256, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "grad_norm": 4.486271381378174, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "learning_rate": 5.513564791479697e-06, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "loss": 0.28296107053756714, + "step": 21970 + }, + { + "ce_loss": 0.02217467688024044, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "distill_loss": 0.20849867165088654, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "ref_ce_loss": 0.04285474866628647, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "loss": 0.35586079955101013, + "step": 21970 + }, + { + "ce_loss": 0.03993077948689461, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "distill_loss": 0.20565329492092133, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "ref_ce_loss": 0.056240327656269073, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "loss": 0.41194766759872437, + "step": 21970 + }, + { + "ce_loss": 0.08907749503850937, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "distill_loss": 0.21700620651245117, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "ref_ce_loss": 0.060150161385536194, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "loss": 0.36081603169441223, + "step": 21970 + }, + { + "ce_loss": 0.010890948586165905, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "distill_loss": 0.12495280802249908, + "epoch": 7.328218812541694, + "step": 21970 + }, + { + "epoch": 7.328218812541694, + "ref_ce_loss": 0.03179723769426346, + "step": 21970 + }, + { + "epoch": 7.331554369579719, + "loss": 0.3436, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "grad_norm": 2.896735429763794, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "learning_rate": 5.45928208507006e-06, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "loss": 0.38351812958717346, + "step": 21980 + }, + { + "ce_loss": 0.01840709149837494, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "distill_loss": 0.24827256798744202, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "ref_ce_loss": 0.05634456127882004, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "loss": 0.25412923097610474, + "step": 21980 + }, + { + "ce_loss": 0.03049309179186821, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "distill_loss": 0.15272663533687592, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "ref_ce_loss": 0.038149669766426086, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "loss": 0.40757080912590027, + "step": 21980 + }, + { + "ce_loss": 0.029884718358516693, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "distill_loss": 0.2894759476184845, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "ref_ce_loss": 0.05772984027862549, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "loss": 0.3815244436264038, + "step": 21980 + }, + { + "ce_loss": 0.024049028754234314, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "distill_loss": 0.1359594762325287, + "epoch": 7.331554369579719, + "step": 21980 + }, + { + "epoch": 7.331554369579719, + "ref_ce_loss": 0.05487595498561859, + "step": 21980 + }, + { + "epoch": 7.334889926617745, + "loss": 0.3177, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "grad_norm": 3.9288063049316406, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "learning_rate": 5.405262963913231e-06, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "loss": 0.3045029044151306, + "step": 21990 + }, + { + "ce_loss": 0.018765542656183243, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "distill_loss": 0.19060693681240082, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "ref_ce_loss": 0.05052812024950981, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "loss": 0.37354931235313416, + "step": 21990 + }, + { + "ce_loss": 0.03803478553891182, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "distill_loss": 0.15719890594482422, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "ref_ce_loss": 0.044708650559186935, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "loss": 0.25138357281684875, + "step": 21990 + }, + { + "ce_loss": 0.03792501613497734, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "distill_loss": 0.13845518231391907, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "ref_ce_loss": 0.04259892553091049, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "loss": 0.30000004172325134, + "step": 21990 + }, + { + "ce_loss": 0.016089096665382385, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "distill_loss": 0.21034321188926697, + "epoch": 7.334889926617745, + "step": 21990 + }, + { + "epoch": 7.334889926617745, + "ref_ce_loss": 0.050738152116537094, + "step": 21990 + }, + { + "epoch": 7.33822548365577, + "loss": 0.329, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "grad_norm": 3.6049487590789795, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "learning_rate": 5.351507526518811e-06, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "loss": 0.7393406629562378, + "step": 22000 + }, + { + "ce_loss": 0.03983442857861519, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "distill_loss": 0.23520348966121674, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "ref_ce_loss": 0.04483520984649658, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "loss": 0.24591264128684998, + "step": 22000 + }, + { + "ce_loss": 0.029901692643761635, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "distill_loss": 0.15099608898162842, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "ref_ce_loss": 0.046401675790548325, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "loss": 0.15313398838043213, + "step": 22000 + }, + { + "ce_loss": 0.013905913569033146, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "distill_loss": 0.08073040097951889, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "ref_ce_loss": 0.022350676357746124, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "loss": 0.13868626952171326, + "step": 22000 + }, + { + "ce_loss": 0.0033345171250402927, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "distill_loss": 0.08276716619729996, + "epoch": 7.33822548365577, + "step": 22000 + }, + { + "epoch": 7.33822548365577, + "ref_ce_loss": 0.03727690130472183, + "step": 22000 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.3285, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "grad_norm": 2.786442518234253, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "learning_rate": 5.2980158709154504e-06, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.20294231176376343, + "step": 22010 + }, + { + "ce_loss": 0.02882452681660652, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "distill_loss": 0.11589177697896957, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "ref_ce_loss": 0.058139339089393616, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.3361784517765045, + "step": 22010 + }, + { + "ce_loss": 0.0275823213160038, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "distill_loss": 0.21154193580150604, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "ref_ce_loss": 0.07388574630022049, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.4463978707790375, + "step": 22010 + }, + { + "ce_loss": 0.03035353124141693, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "distill_loss": 0.24757204949855804, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "ref_ce_loss": 0.054496344178915024, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "loss": 0.3473254442214966, + "step": 22010 + }, + { + "ce_loss": 0.03475075587630272, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "distill_loss": 0.27547967433929443, + "epoch": 7.3415610406937954, + "step": 22010 + }, + { + "epoch": 7.3415610406937954, + "ref_ce_loss": 0.036626383662223816, + "step": 22010 + }, + { + "epoch": 7.344896597731822, + "loss": 0.324, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "grad_norm": 2.6016852855682373, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "learning_rate": 5.244788094650887e-06, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "loss": 0.22592085599899292, + "step": 22020 + }, + { + "ce_loss": 0.006041758228093386, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "distill_loss": 0.16178952157497406, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "ref_ce_loss": 0.03367564082145691, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "loss": 0.3396591544151306, + "step": 22020 + }, + { + "ce_loss": 0.033659134060144424, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "distill_loss": 0.13412310183048248, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "ref_ce_loss": 0.044457755982875824, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "loss": 0.34963589906692505, + "step": 22020 + }, + { + "ce_loss": 0.021980000659823418, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "distill_loss": 0.2693895101547241, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "ref_ce_loss": 0.058126404881477356, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "loss": 0.3431185483932495, + "step": 22020 + }, + { + "ce_loss": 0.026764724403619766, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "distill_loss": 0.15950065851211548, + "epoch": 7.344896597731822, + "step": 22020 + }, + { + "epoch": 7.344896597731822, + "ref_ce_loss": 0.04904036968946457, + "step": 22020 + }, + { + "epoch": 7.348232154769846, + "loss": 0.3301, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "grad_norm": 2.480509042739868, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "learning_rate": 5.191824294791558e-06, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "loss": 0.5115179419517517, + "step": 22030 + }, + { + "ce_loss": 0.05640953779220581, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "distill_loss": 0.2712617814540863, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "ref_ce_loss": 0.06187128275632858, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "loss": 0.22394618391990662, + "step": 22030 + }, + { + "ce_loss": 0.013718812726438046, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "distill_loss": 0.1504044383764267, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "ref_ce_loss": 0.03647022321820259, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "loss": 0.2851472795009613, + "step": 22030 + }, + { + "ce_loss": 0.05637501925230026, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "distill_loss": 0.14488068222999573, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "ref_ce_loss": 0.05806015431880951, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "loss": 0.2885356545448303, + "step": 22030 + }, + { + "ce_loss": 0.06287986040115356, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "distill_loss": 0.15702137351036072, + "epoch": 7.348232154769846, + "step": 22030 + }, + { + "epoch": 7.348232154769846, + "ref_ce_loss": 0.06848686933517456, + "step": 22030 + }, + { + "epoch": 7.351567711807872, + "loss": 0.3485, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "grad_norm": 5.346456050872803, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "learning_rate": 5.139124567922553e-06, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "loss": 0.3215425908565521, + "step": 22040 + }, + { + "ce_loss": 0.03501839190721512, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "distill_loss": 0.25432682037353516, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "ref_ce_loss": 0.032068297266960144, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "loss": 0.24832525849342346, + "step": 22040 + }, + { + "ce_loss": 0.01957583799958229, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "distill_loss": 0.14824563264846802, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "ref_ce_loss": 0.03626695275306702, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "loss": 0.22427628934383392, + "step": 22040 + }, + { + "ce_loss": 0.023650500923395157, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "distill_loss": 0.15130510926246643, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "ref_ce_loss": 0.04904693365097046, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "loss": 0.4551049768924713, + "step": 22040 + }, + { + "ce_loss": 0.02893562614917755, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "distill_loss": 0.24910324811935425, + "epoch": 7.351567711807872, + "step": 22040 + }, + { + "epoch": 7.351567711807872, + "ref_ce_loss": 0.04241586849093437, + "step": 22040 + }, + { + "epoch": 7.354903268845897, + "loss": 0.3493, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "grad_norm": 3.2416086196899414, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "learning_rate": 5.0866890101473826e-06, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "loss": 0.2246338427066803, + "step": 22050 + }, + { + "ce_loss": 0.016258342191576958, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "distill_loss": 0.14530493319034576, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "ref_ce_loss": 0.06291282176971436, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "loss": 0.2104296237230301, + "step": 22050 + }, + { + "ce_loss": 0.026468923315405846, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "distill_loss": 0.12647053599357605, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "ref_ce_loss": 0.04710124433040619, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "loss": 0.4000055491924286, + "step": 22050 + }, + { + "ce_loss": 0.05000147968530655, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "distill_loss": 0.2810991108417511, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "ref_ce_loss": 0.042414627969264984, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "loss": 0.32178622484207153, + "step": 22050 + }, + { + "ce_loss": 0.01044121477752924, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "distill_loss": 0.21580266952514648, + "epoch": 7.354903268845897, + "step": 22050 + }, + { + "epoch": 7.354903268845897, + "ref_ce_loss": 0.0623142309486866, + "step": 22050 + }, + { + "epoch": 7.358238825883923, + "loss": 0.3604, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "grad_norm": 5.3949360847473145, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "learning_rate": 5.034517717087838e-06, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "loss": 0.25852417945861816, + "step": 22060 + }, + { + "ce_loss": 0.04160953685641289, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "distill_loss": 0.14878787100315094, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "ref_ce_loss": 0.04364071413874626, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "loss": 0.20833426713943481, + "step": 22060 + }, + { + "ce_loss": 0.01812390238046646, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "distill_loss": 0.13836698234081268, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "ref_ce_loss": 0.051717013120651245, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "loss": 0.3156091570854187, + "step": 22060 + }, + { + "ce_loss": 0.07016031444072723, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "distill_loss": 0.18352413177490234, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "ref_ce_loss": 0.041767437011003494, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "loss": 0.24763324856758118, + "step": 22060 + }, + { + "ce_loss": 0.019852010533213615, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "distill_loss": 0.1532268226146698, + "epoch": 7.358238825883923, + "step": 22060 + }, + { + "epoch": 7.358238825883923, + "ref_ce_loss": 0.044450026005506516, + "step": 22060 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.3264, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "grad_norm": 3.7023305892944336, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "learning_rate": 4.98261078388375e-06, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.39206361770629883, + "step": 22070 + }, + { + "ce_loss": 0.0862056091427803, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "distill_loss": 0.20357592403888702, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "ref_ce_loss": 0.04326269030570984, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.25698763132095337, + "step": 22070 + }, + { + "ce_loss": 0.0048144906759262085, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "distill_loss": 0.10615068674087524, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "ref_ce_loss": 0.04334083944559097, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.5344517230987549, + "step": 22070 + }, + { + "ce_loss": 0.10412358492612839, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "distill_loss": 0.1765112578868866, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "ref_ce_loss": 0.06247731298208237, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "loss": 0.3683743476867676, + "step": 22070 + }, + { + "ce_loss": 0.030895305797457695, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "distill_loss": 0.16962184011936188, + "epoch": 7.3615743829219475, + "step": 22070 + }, + { + "epoch": 7.3615743829219475, + "ref_ce_loss": 0.04016808047890663, + "step": 22070 + }, + { + "epoch": 7.364909939959974, + "loss": 0.3374, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "grad_norm": 2.693723678588867, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "learning_rate": 4.9309683051929e-06, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "loss": 0.38984400033950806, + "step": 22080 + }, + { + "ce_loss": 0.010099812410771847, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "distill_loss": 0.17128121852874756, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "ref_ce_loss": 0.03249981254339218, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "loss": 0.29879438877105713, + "step": 22080 + }, + { + "ce_loss": 0.07127712666988373, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "distill_loss": 0.15073591470718384, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "ref_ce_loss": 0.06147244572639465, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "loss": 0.41210445761680603, + "step": 22080 + }, + { + "ce_loss": 0.02065237984061241, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "distill_loss": 0.2004043161869049, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "ref_ce_loss": 0.03282521665096283, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "loss": 1.1021695137023926, + "step": 22080 + }, + { + "ce_loss": 0.03629375249147415, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "distill_loss": 0.23419694602489471, + "epoch": 7.364909939959974, + "step": 22080 + }, + { + "epoch": 7.364909939959974, + "ref_ce_loss": 0.03863668441772461, + "step": 22080 + }, + { + "epoch": 7.368245496997998, + "loss": 0.3415, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "grad_norm": 3.0293538570404053, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "learning_rate": 4.879590375190789e-06, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "loss": 0.4525725543498993, + "step": 22090 + }, + { + "ce_loss": 0.06142522767186165, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "distill_loss": 0.23353439569473267, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "ref_ce_loss": 0.09087929874658585, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "loss": 0.2484576404094696, + "step": 22090 + }, + { + "ce_loss": 0.05308183655142784, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "distill_loss": 0.1403602808713913, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "ref_ce_loss": 0.03931579738855362, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "loss": 0.2212214171886444, + "step": 22090 + }, + { + "ce_loss": 0.006685642991214991, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "distill_loss": 0.11087189614772797, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "ref_ce_loss": 0.05651544779539108, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "loss": 0.2329418957233429, + "step": 22090 + }, + { + "ce_loss": 0.021505938842892647, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "distill_loss": 0.15570920705795288, + "epoch": 7.368245496997998, + "step": 22090 + }, + { + "epoch": 7.368245496997998, + "ref_ce_loss": 0.03630523011088371, + "step": 22090 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.3003, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "grad_norm": 2.885834217071533, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "learning_rate": 4.82847708757052e-06, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.4009764492511749, + "step": 22100 + }, + { + "ce_loss": 0.03690299391746521, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "distill_loss": 0.17693805694580078, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "ref_ce_loss": 0.04862113296985626, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.36756935715675354, + "step": 22100 + }, + { + "ce_loss": 0.017550576478242874, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "distill_loss": 0.26459965109825134, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "ref_ce_loss": 0.056327540427446365, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.1626485288143158, + "step": 22100 + }, + { + "ce_loss": 0.005254819057881832, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "distill_loss": 0.10461093485355377, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "ref_ce_loss": 0.03123287670314312, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "loss": 0.2599450647830963, + "step": 22100 + }, + { + "ce_loss": 0.029316769912838936, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "distill_loss": 0.1351774036884308, + "epoch": 7.3715810540360245, + "step": 22100 + }, + { + "epoch": 7.3715810540360245, + "ref_ce_loss": 0.04561307653784752, + "step": 22100 + }, + { + "epoch": 7.374916611074049, + "loss": 0.3431, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "grad_norm": 4.652987957000732, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "learning_rate": 4.777628535542549e-06, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "loss": 0.38078269362449646, + "step": 22110 + }, + { + "ce_loss": 0.039288006722927094, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "distill_loss": 0.24713261425495148, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "ref_ce_loss": 0.06356287002563477, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "loss": 0.35073116421699524, + "step": 22110 + }, + { + "ce_loss": 0.04300399497151375, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "distill_loss": 0.22265471518039703, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "ref_ce_loss": 0.08492851257324219, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "loss": 0.47575217485427856, + "step": 22110 + }, + { + "ce_loss": 0.0735011100769043, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "distill_loss": 0.1917407512664795, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "ref_ce_loss": 0.07961657643318176, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "loss": 0.2863282561302185, + "step": 22110 + }, + { + "ce_loss": 0.04646497592329979, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "distill_loss": 0.1628018170595169, + "epoch": 7.374916611074049, + "step": 22110 + }, + { + "epoch": 7.374916611074049, + "ref_ce_loss": 0.052869487553834915, + "step": 22110 + }, + { + "epoch": 7.378252168112075, + "loss": 0.3514, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "grad_norm": 3.9610838890075684, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "learning_rate": 4.727044811834585e-06, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "loss": 0.2892042398452759, + "step": 22120 + }, + { + "ce_loss": 0.03907979279756546, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "distill_loss": 0.1598028540611267, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "ref_ce_loss": 0.07168059796094894, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "loss": 0.20294763147830963, + "step": 22120 + }, + { + "ce_loss": 0.009141350165009499, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "distill_loss": 0.13888469338417053, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "ref_ce_loss": 0.03615092858672142, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "loss": 0.43699246644973755, + "step": 22120 + }, + { + "ce_loss": 0.07229337841272354, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "distill_loss": 0.26068443059921265, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "ref_ce_loss": 0.05587449297308922, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "loss": 0.2592012584209442, + "step": 22120 + }, + { + "ce_loss": 0.058118805289268494, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "distill_loss": 0.1439739167690277, + "epoch": 7.378252168112075, + "step": 22120 + }, + { + "epoch": 7.378252168112075, + "ref_ce_loss": 0.037676967680454254, + "step": 22120 + }, + { + "epoch": 7.3815877251501, + "loss": 0.3062, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "grad_norm": 3.771886110305786, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "learning_rate": 4.676726008691356e-06, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "loss": 0.32454532384872437, + "step": 22130 + }, + { + "ce_loss": 0.045760221779346466, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "distill_loss": 0.16390490531921387, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "ref_ce_loss": 0.07490749657154083, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "loss": 0.275404691696167, + "step": 22130 + }, + { + "ce_loss": 0.033711016178131104, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "distill_loss": 0.12929511070251465, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "ref_ce_loss": 0.04097050428390503, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "loss": 0.5244038105010986, + "step": 22130 + }, + { + "ce_loss": 0.010484430938959122, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "distill_loss": 0.2396150678396225, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "ref_ce_loss": 0.04648149758577347, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "loss": 0.3090200424194336, + "step": 22130 + }, + { + "ce_loss": 0.03500140830874443, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "distill_loss": 0.18160416185855865, + "epoch": 7.3815877251501, + "step": 22130 + }, + { + "epoch": 7.3815877251501, + "ref_ce_loss": 0.0565800704061985, + "step": 22130 + }, + { + "epoch": 7.384923282188126, + "loss": 0.31, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "grad_norm": 2.869490623474121, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "learning_rate": 4.626672217874544e-06, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "loss": 0.21312899887561798, + "step": 22140 + }, + { + "ce_loss": 0.0147785022854805, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "distill_loss": 0.12025727331638336, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "ref_ce_loss": 0.07750562578439713, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "loss": 0.15995611250400543, + "step": 22140 + }, + { + "ce_loss": 0.010545612312853336, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "distill_loss": 0.10653532296419144, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "ref_ce_loss": 0.04272819682955742, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "loss": 0.2929874360561371, + "step": 22140 + }, + { + "ce_loss": 0.004137550480663776, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "distill_loss": 0.23183076083660126, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "ref_ce_loss": 0.04496045410633087, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "loss": 0.3128769099712372, + "step": 22140 + }, + { + "ce_loss": 0.022112419828772545, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "distill_loss": 0.22397132217884064, + "epoch": 7.384923282188126, + "step": 22140 + }, + { + "epoch": 7.384923282188126, + "ref_ce_loss": 0.054382920265197754, + "step": 22140 + }, + { + "epoch": 7.38825883922615, + "loss": 0.3406, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "grad_norm": 3.3012917041778564, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "learning_rate": 4.576883530662517e-06, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "loss": 0.4942885935306549, + "step": 22150 + }, + { + "ce_loss": 0.05201977118849754, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "distill_loss": 0.29401320219039917, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "ref_ce_loss": 0.03955161198973656, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "loss": 0.25629130005836487, + "step": 22150 + }, + { + "ce_loss": 0.03334483876824379, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "distill_loss": 0.1529396027326584, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "ref_ce_loss": 0.04761706292629242, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "loss": 0.3196744918823242, + "step": 22150 + }, + { + "ce_loss": 0.032249510288238525, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "distill_loss": 0.24185246229171753, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "ref_ce_loss": 0.03860660269856453, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "loss": 0.22365950047969818, + "step": 22150 + }, + { + "ce_loss": 0.0067135305143892765, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "distill_loss": 0.1155950278043747, + "epoch": 7.38825883922615, + "step": 22150 + }, + { + "epoch": 7.38825883922615, + "ref_ce_loss": 0.0510890819132328, + "step": 22150 + }, + { + "epoch": 7.391594396264177, + "loss": 0.3196, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "grad_norm": 3.433950185775757, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "learning_rate": 4.527360037850197e-06, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "loss": 0.3733089566230774, + "step": 22160 + }, + { + "ce_loss": 0.045424215495586395, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "distill_loss": 0.23835572600364685, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "ref_ce_loss": 0.04281904175877571, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "loss": 0.21638333797454834, + "step": 22160 + }, + { + "ce_loss": 0.03292544558644295, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "distill_loss": 0.10522749274969101, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "ref_ce_loss": 0.05115901306271553, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "loss": 0.19880634546279907, + "step": 22160 + }, + { + "ce_loss": 0.025685761123895645, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "distill_loss": 0.09403679519891739, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "ref_ce_loss": 0.030222853645682335, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "loss": 0.37925201654434204, + "step": 22160 + }, + { + "ce_loss": 0.024915773421525955, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "distill_loss": 0.2479330599308014, + "epoch": 7.391594396264177, + "step": 22160 + }, + { + "epoch": 7.391594396264177, + "ref_ce_loss": 0.0947517454624176, + "step": 22160 + }, + { + "epoch": 7.394929953302201, + "loss": 0.3212, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "grad_norm": 2.706461191177368, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "learning_rate": 4.4781018297488755e-06, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "loss": 0.3403114676475525, + "step": 22170 + }, + { + "ce_loss": 0.030661532655358315, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "distill_loss": 0.21051430702209473, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "ref_ce_loss": 0.042628444731235504, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "loss": 0.27130600810050964, + "step": 22170 + }, + { + "ce_loss": 0.042654745280742645, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "distill_loss": 0.14102618396282196, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "ref_ce_loss": 0.060014378279447556, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "loss": 0.3314165472984314, + "step": 22170 + }, + { + "ce_loss": 0.026579681783914566, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "distill_loss": 0.19740203022956848, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "ref_ce_loss": 0.04624663665890694, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "loss": 0.1601991355419159, + "step": 22170 + }, + { + "ce_loss": 0.010934227146208286, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "distill_loss": 0.10959579795598984, + "epoch": 7.394929953302201, + "step": 22170 + }, + { + "epoch": 7.394929953302201, + "ref_ce_loss": 0.026321228593587875, + "step": 22170 + }, + { + "epoch": 7.398265510340227, + "loss": 0.3253, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "grad_norm": 3.5024940967559814, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "learning_rate": 4.429108996186115e-06, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "loss": 0.3466987609863281, + "step": 22180 + }, + { + "ce_loss": 0.018985463306307793, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "distill_loss": 0.19944104552268982, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "ref_ce_loss": 0.06336013227701187, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "loss": 0.38137781620025635, + "step": 22180 + }, + { + "ce_loss": 0.07608304917812347, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "distill_loss": 0.15607169270515442, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "ref_ce_loss": 0.06901339441537857, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "loss": 0.34570735692977905, + "step": 22180 + }, + { + "ce_loss": 0.04311949387192726, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "distill_loss": 0.202956423163414, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "ref_ce_loss": 0.04889204725623131, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "loss": 0.30262553691864014, + "step": 22180 + }, + { + "ce_loss": 0.027611130848526955, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "distill_loss": 0.1811426728963852, + "epoch": 7.398265510340227, + "step": 22180 + }, + { + "epoch": 7.398265510340227, + "ref_ce_loss": 0.06274111568927765, + "step": 22180 + }, + { + "epoch": 7.401601067378252, + "loss": 0.3435, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "grad_norm": 2.390714645385742, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "learning_rate": 4.380381626505514e-06, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "loss": 0.1738487333059311, + "step": 22190 + }, + { + "ce_loss": 0.012540126219391823, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "distill_loss": 0.10962365567684174, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "ref_ce_loss": 0.05156319588422775, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "loss": 0.4952594041824341, + "step": 22190 + }, + { + "ce_loss": 0.04229583218693733, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "distill_loss": 0.2744204103946686, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "ref_ce_loss": 0.07085690647363663, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "loss": 0.2841060161590576, + "step": 22190 + }, + { + "ce_loss": 0.007605770602822304, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "distill_loss": 0.20285113155841827, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "ref_ce_loss": 0.04578419402241707, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "loss": 0.3293168246746063, + "step": 22190 + }, + { + "ce_loss": 0.05847109109163284, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "distill_loss": 0.204945370554924, + "epoch": 7.401601067378252, + "step": 22190 + }, + { + "epoch": 7.401601067378252, + "ref_ce_loss": 0.03360971435904503, + "step": 22190 + }, + { + "epoch": 7.404936624416278, + "loss": 0.3205, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "grad_norm": 2.24493670463562, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "learning_rate": 4.3319198095665915e-06, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "loss": 0.18313585221767426, + "step": 22200 + }, + { + "ce_loss": 0.03454110771417618, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "distill_loss": 0.10332349687814713, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "ref_ce_loss": 0.0450221411883831, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "loss": 0.34247058629989624, + "step": 22200 + }, + { + "ce_loss": 0.07478012144565582, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "distill_loss": 0.2062947154045105, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "ref_ce_loss": 0.06096559762954712, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "loss": 0.26020678877830505, + "step": 22200 + }, + { + "ce_loss": 0.05754929408431053, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "distill_loss": 0.15126144886016846, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "ref_ce_loss": 0.02520419843494892, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "loss": 0.48407548666000366, + "step": 22200 + }, + { + "ce_loss": 0.06056468188762665, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "distill_loss": 0.2092147171497345, + "epoch": 7.404936624416278, + "step": 22200 + }, + { + "epoch": 7.404936624416278, + "ref_ce_loss": 0.10852415859699249, + "step": 22200 + }, + { + "epoch": 7.408272181454302, + "loss": 0.3367, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "grad_norm": 3.42170786857605, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "learning_rate": 4.283723633744557e-06, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "loss": 0.1436109095811844, + "step": 22210 + }, + { + "ce_loss": 0.0023982953280210495, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "distill_loss": 0.09932367503643036, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "ref_ce_loss": 0.029441095888614655, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "loss": 0.28712987899780273, + "step": 22210 + }, + { + "ce_loss": 0.01837264373898506, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "distill_loss": 0.14383924007415771, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "ref_ce_loss": 0.04897449538111687, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "loss": 0.31028202176094055, + "step": 22210 + }, + { + "ce_loss": 0.024862175807356834, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "distill_loss": 0.1748422533273697, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "ref_ce_loss": 0.055731479078531265, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "loss": 0.2121187001466751, + "step": 22210 + }, + { + "ce_loss": 0.016437353566288948, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "distill_loss": 0.13269154727458954, + "epoch": 7.408272181454302, + "step": 22210 + }, + { + "epoch": 7.408272181454302, + "ref_ce_loss": 0.04025792330503464, + "step": 22210 + }, + { + "epoch": 7.411607738492329, + "loss": 0.2707, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "grad_norm": 2.6476309299468994, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "learning_rate": 4.235793186930237e-06, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "loss": 0.2639855146408081, + "step": 22220 + }, + { + "ce_loss": 0.042686786502599716, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "distill_loss": 0.17029103636741638, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "ref_ce_loss": 0.050944022834300995, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "loss": 0.4206048548221588, + "step": 22220 + }, + { + "ce_loss": 0.03519580513238907, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "distill_loss": 0.3233287036418915, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "ref_ce_loss": 0.061676353216171265, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "loss": 0.5416637659072876, + "step": 22220 + }, + { + "ce_loss": 0.03354215249419212, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "distill_loss": 0.13695743680000305, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "ref_ce_loss": 0.03787124529480934, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "loss": 0.3412051200866699, + "step": 22220 + }, + { + "ce_loss": 0.01873054727911949, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "distill_loss": 0.2385825216770172, + "epoch": 7.411607738492329, + "step": 22220 + }, + { + "epoch": 7.411607738492329, + "ref_ce_loss": 0.04805649444460869, + "step": 22220 + }, + { + "epoch": 7.414943295530353, + "loss": 0.3411, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "grad_norm": 2.7999589443206787, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "learning_rate": 4.188128556529846e-06, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "loss": 0.26248857378959656, + "step": 22230 + }, + { + "ce_loss": 0.017941398546099663, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "distill_loss": 0.17652080953121185, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "ref_ce_loss": 0.028682641685009003, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "loss": 0.18539687991142273, + "step": 22230 + }, + { + "ce_loss": 0.026324966922402382, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "distill_loss": 0.11990171670913696, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "ref_ce_loss": 0.038930267095565796, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "loss": 0.6258493065834045, + "step": 22230 + }, + { + "ce_loss": 0.06917796283960342, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "distill_loss": 0.18112300336360931, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "ref_ce_loss": 0.07729977369308472, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "loss": 0.3908163905143738, + "step": 22230 + }, + { + "ce_loss": 0.08195307105779648, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "distill_loss": 0.18970994651317596, + "epoch": 7.414943295530353, + "step": 22230 + }, + { + "epoch": 7.414943295530353, + "ref_ce_loss": 0.052481528371572495, + "step": 22230 + }, + { + "epoch": 7.418278852568379, + "loss": 0.3054, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "grad_norm": 2.6394460201263428, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "learning_rate": 4.1407298294649064e-06, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "loss": 0.3083247244358063, + "step": 22240 + }, + { + "ce_loss": 0.023212047293782234, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "distill_loss": 0.19049547612667084, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "ref_ce_loss": 0.07450911402702332, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "loss": 0.27266043424606323, + "step": 22240 + }, + { + "ce_loss": 0.048310212790966034, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "distill_loss": 0.15052545070648193, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "ref_ce_loss": 0.06026710197329521, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "loss": 0.20550397038459778, + "step": 22240 + }, + { + "ce_loss": 0.002684173174202442, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "distill_loss": 0.157025545835495, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "ref_ce_loss": 0.045708607882261276, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "loss": 0.3732839822769165, + "step": 22240 + }, + { + "ce_loss": 0.048869553953409195, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "distill_loss": 0.26047688722610474, + "epoch": 7.418278852568379, + "step": 22240 + }, + { + "epoch": 7.418278852568379, + "ref_ce_loss": 0.04291224107146263, + "step": 22240 + }, + { + "epoch": 7.421614409606404, + "loss": 0.338, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "grad_norm": 4.603501319885254, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "learning_rate": 4.093597092171941e-06, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "loss": 0.4850652813911438, + "step": 22250 + }, + { + "ce_loss": 0.014903169125318527, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "distill_loss": 0.15682287514209747, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "ref_ce_loss": 0.05811598151922226, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "loss": 0.29419952630996704, + "step": 22250 + }, + { + "ce_loss": 0.01383958663791418, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "distill_loss": 0.16313058137893677, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "ref_ce_loss": 0.032282061874866486, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "loss": 0.3315328359603882, + "step": 22250 + }, + { + "ce_loss": 0.0404106043279171, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "distill_loss": 0.19743554294109344, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "ref_ce_loss": 0.07366403937339783, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "loss": 0.29002076387405396, + "step": 22250 + }, + { + "ce_loss": 0.024073943495750427, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "distill_loss": 0.1868630051612854, + "epoch": 7.421614409606404, + "step": 22250 + }, + { + "epoch": 7.421614409606404, + "ref_ce_loss": 0.04270247742533684, + "step": 22250 + }, + { + "epoch": 7.42494996664443, + "loss": 0.3519, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "grad_norm": 2.3166913986206055, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "learning_rate": 4.0467304306025125e-06, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "loss": 0.3277677595615387, + "step": 22260 + }, + { + "ce_loss": 0.021930452436208725, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "distill_loss": 0.16001535952091217, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "ref_ce_loss": 0.050045643001794815, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "loss": 0.40044260025024414, + "step": 22260 + }, + { + "ce_loss": 0.03517067804932594, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "distill_loss": 0.16368120908737183, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "ref_ce_loss": 0.05305195599794388, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "loss": 0.38687756657600403, + "step": 22260 + }, + { + "ce_loss": 0.06107117980718613, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "distill_loss": 0.1886368691921234, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "ref_ce_loss": 0.06111163645982742, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "loss": 0.3298104405403137, + "step": 22260 + }, + { + "ce_loss": 0.050029948353767395, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "distill_loss": 0.13902169466018677, + "epoch": 7.42494996664443, + "step": 22260 + }, + { + "epoch": 7.42494996664443, + "ref_ce_loss": 0.056891169399023056, + "step": 22260 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.3252, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "grad_norm": 2.7258355617523193, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "learning_rate": 4.000129930222906e-06, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.2858698070049286, + "step": 22270 + }, + { + "ce_loss": 0.019824549555778503, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "distill_loss": 0.15271005034446716, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "ref_ce_loss": 0.06389021873474121, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.2244950234889984, + "step": 22270 + }, + { + "ce_loss": 0.01788237690925598, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "distill_loss": 0.11896365880966187, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "ref_ce_loss": 0.053938981145620346, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.2784402370452881, + "step": 22270 + }, + { + "ce_loss": 0.02266446314752102, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "distill_loss": 0.12554806470870972, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "ref_ce_loss": 0.056586507707834244, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "loss": 0.2199580818414688, + "step": 22270 + }, + { + "ce_loss": 0.01635870523750782, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "distill_loss": 0.16090208292007446, + "epoch": 7.4282855236824545, + "step": 22270 + }, + { + "epoch": 7.4282855236824545, + "ref_ce_loss": 0.03102097287774086, + "step": 22270 + }, + { + "epoch": 7.431621080720481, + "loss": 0.2772, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "grad_norm": 3.1755130290985107, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "learning_rate": 3.95379567601406e-06, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "loss": 0.25050878524780273, + "step": 22280 + }, + { + "ce_loss": 0.041921354830265045, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "distill_loss": 0.13335081934928894, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "ref_ce_loss": 0.05765026807785034, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "loss": 0.24195145070552826, + "step": 22280 + }, + { + "ce_loss": 0.06510787457227707, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "distill_loss": 0.13477462530136108, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "ref_ce_loss": 0.03168313577771187, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "loss": 0.3614014983177185, + "step": 22280 + }, + { + "ce_loss": 0.06072500720620155, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "distill_loss": 0.2287401407957077, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "ref_ce_loss": 0.07151369005441666, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "loss": 0.24856488406658173, + "step": 22280 + }, + { + "ce_loss": 0.009639018215239048, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "distill_loss": 0.16313186287879944, + "epoch": 7.431621080720481, + "step": 22280 + }, + { + "epoch": 7.431621080720481, + "ref_ce_loss": 0.048035651445388794, + "step": 22280 + }, + { + "epoch": 7.434956637758505, + "loss": 0.3219, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "grad_norm": 2.436527729034424, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "learning_rate": 3.9077277524714015e-06, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "loss": 0.2814217805862427, + "step": 22290 + }, + { + "ce_loss": 0.03163493424654007, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "distill_loss": 0.14511892199516296, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "ref_ce_loss": 0.033653974533081055, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "loss": 0.22175005078315735, + "step": 22290 + }, + { + "ce_loss": 0.03425311669707298, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "distill_loss": 0.132270947098732, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "ref_ce_loss": 0.03548649325966835, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "loss": 0.2341788113117218, + "step": 22290 + }, + { + "ce_loss": 0.017922593280673027, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "distill_loss": 0.18605180084705353, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "ref_ce_loss": 0.030135784298181534, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "loss": 0.24607765674591064, + "step": 22290 + }, + { + "ce_loss": 0.022272905334830284, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "distill_loss": 0.12093356996774673, + "epoch": 7.434956637758505, + "step": 22290 + }, + { + "epoch": 7.434956637758505, + "ref_ce_loss": 0.03579564765095711, + "step": 22290 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.3158, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "grad_norm": 3.7742974758148193, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "learning_rate": 3.861926243604596e-06, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.14824345707893372, + "step": 22300 + }, + { + "ce_loss": 0.00874150637537241, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "distill_loss": 0.0849747583270073, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "ref_ce_loss": 0.04229980707168579, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.25204765796661377, + "step": 22300 + }, + { + "ce_loss": 0.06930907815694809, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "distill_loss": 0.13660091161727905, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "ref_ce_loss": 0.046087075024843216, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.2791946530342102, + "step": 22300 + }, + { + "ce_loss": 0.0685519129037857, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "distill_loss": 0.1319180130958557, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "ref_ce_loss": 0.054504599422216415, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "loss": 0.2951914072036743, + "step": 22300 + }, + { + "ce_loss": 0.009827563539147377, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "distill_loss": 0.21003487706184387, + "epoch": 7.4382921947965315, + "step": 22300 + }, + { + "epoch": 7.4382921947965315, + "ref_ce_loss": 0.04733233526349068, + "step": 22300 + }, + { + "epoch": 7.441627751834556, + "loss": 0.3302, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "grad_norm": 4.366042137145996, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "learning_rate": 3.816391232937549e-06, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "loss": 0.33673936128616333, + "step": 22310 + }, + { + "ce_loss": 0.01169640477746725, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "distill_loss": 0.1271243542432785, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "ref_ce_loss": 0.067172110080719, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "loss": 0.2582665681838989, + "step": 22310 + }, + { + "ce_loss": 0.043444760143756866, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "distill_loss": 0.13749757409095764, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "ref_ce_loss": 0.061841681599617004, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "loss": 0.23331327736377716, + "step": 22310 + }, + { + "ce_loss": 0.015050852671265602, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "distill_loss": 0.11916429549455643, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "ref_ce_loss": 0.047242291271686554, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "loss": 0.278873085975647, + "step": 22310 + }, + { + "ce_loss": 0.02831905707716942, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "distill_loss": 0.17653679847717285, + "epoch": 7.441627751834556, + "step": 22310 + }, + { + "epoch": 7.441627751834556, + "ref_ce_loss": 0.03725513815879822, + "step": 22310 + }, + { + "epoch": 7.444963308872582, + "loss": 0.3126, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "grad_norm": 2.6882822513580322, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "learning_rate": 3.7711228035081863e-06, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "loss": 0.16673119366168976, + "step": 22320 + }, + { + "ce_loss": 0.0011848431313410401, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "distill_loss": 0.13170509040355682, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "ref_ce_loss": 0.018590757623314857, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "loss": 0.27042847871780396, + "step": 22320 + }, + { + "ce_loss": 0.030981097370386124, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "distill_loss": 0.12473780661821365, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "ref_ce_loss": 0.03478186950087547, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "loss": 0.16557539999485016, + "step": 22320 + }, + { + "ce_loss": 0.005286859814077616, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "distill_loss": 0.10098185390233994, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "ref_ce_loss": 0.028873972594738007, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "loss": 0.2681327760219574, + "step": 22320 + }, + { + "ce_loss": 0.028545308858156204, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "distill_loss": 0.16974365711212158, + "epoch": 7.444963308872582, + "step": 22320 + }, + { + "epoch": 7.444963308872582, + "ref_ce_loss": 0.029505878686904907, + "step": 22320 + }, + { + "epoch": 7.448298865910607, + "loss": 0.2978, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "grad_norm": 3.4482717514038086, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "learning_rate": 3.7261210378682238e-06, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "loss": 0.38903453946113586, + "step": 22330 + }, + { + "ce_loss": 0.056097060441970825, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "distill_loss": 0.23296581208705902, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "ref_ce_loss": 0.06274658441543579, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "loss": 0.36393213272094727, + "step": 22330 + }, + { + "ce_loss": 0.06572960317134857, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "distill_loss": 0.2237122654914856, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "ref_ce_loss": 0.05479707568883896, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "loss": 0.19030554592609406, + "step": 22330 + }, + { + "ce_loss": 0.0062269787304103374, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "distill_loss": 0.1253552883863449, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "ref_ce_loss": 0.03615328297019005, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "loss": 0.5729780793190002, + "step": 22330 + }, + { + "ce_loss": 0.09128619730472565, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "distill_loss": 0.1655218005180359, + "epoch": 7.448298865910607, + "step": 22330 + }, + { + "epoch": 7.448298865910607, + "ref_ce_loss": 0.07864798605442047, + "step": 22330 + }, + { + "epoch": 7.451634422948633, + "loss": 0.3321, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "grad_norm": 4.633417129516602, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "learning_rate": 3.6813860180831824e-06, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "loss": 0.20244018733501434, + "step": 22340 + }, + { + "ce_loss": 0.01871795393526554, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "distill_loss": 0.12031986564397812, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "ref_ce_loss": 0.036138541996479034, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "loss": 0.2242555469274521, + "step": 22340 + }, + { + "ce_loss": 0.03619326651096344, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "distill_loss": 0.1221376582980156, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "ref_ce_loss": 0.04511701688170433, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "loss": 0.19007287919521332, + "step": 22340 + }, + { + "ce_loss": 0.0217959713190794, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "distill_loss": 0.12164486944675446, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "ref_ce_loss": 0.04645758494734764, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "loss": 0.3102092146873474, + "step": 22340 + }, + { + "ce_loss": 0.041494760662317276, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "distill_loss": 0.11831970512866974, + "epoch": 7.451634422948633, + "step": 22340 + }, + { + "epoch": 7.451634422948633, + "ref_ce_loss": 0.05176829546689987, + "step": 22340 + }, + { + "epoch": 7.454969979986657, + "loss": 0.2923, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "grad_norm": 3.0117533206939697, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "learning_rate": 3.6369178257320385e-06, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "loss": 0.4076274633407593, + "step": 22350 + }, + { + "ce_loss": 0.047179240733385086, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "distill_loss": 0.28673622012138367, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "ref_ce_loss": 0.0505196787416935, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "loss": 0.20299524068832397, + "step": 22350 + }, + { + "ce_loss": 0.016069624572992325, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "distill_loss": 0.12883426249027252, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "ref_ce_loss": 0.057666610926389694, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "loss": 0.198039710521698, + "step": 22350 + }, + { + "ce_loss": 0.013000822626054287, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "distill_loss": 0.13102981448173523, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "ref_ce_loss": 0.035978831350803375, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "loss": 0.514954149723053, + "step": 22350 + }, + { + "ce_loss": 0.025409677997231483, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "distill_loss": 0.1583758145570755, + "epoch": 7.454969979986657, + "step": 22350 + }, + { + "epoch": 7.454969979986657, + "ref_ce_loss": 0.07159221172332764, + "step": 22350 + }, + { + "epoch": 7.458305537024684, + "loss": 0.3204, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "grad_norm": 4.352915287017822, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "learning_rate": 3.592716541907259e-06, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "loss": 0.21900595724582672, + "step": 22360 + }, + { + "ce_loss": 0.026947587728500366, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "distill_loss": 0.13711225986480713, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "ref_ce_loss": 0.03863257169723511, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "loss": 0.3650367259979248, + "step": 22360 + }, + { + "ce_loss": 0.06087297573685646, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "distill_loss": 0.24326029419898987, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "ref_ce_loss": 0.03927703574299812, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "loss": 0.19165660440921783, + "step": 22360 + }, + { + "ce_loss": 0.02028833143413067, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "distill_loss": 0.12279950082302094, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "ref_ce_loss": 0.048306904733181, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "loss": 0.23307588696479797, + "step": 22360 + }, + { + "ce_loss": 0.024532662704586983, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "distill_loss": 0.13930636644363403, + "epoch": 7.458305537024684, + "step": 22360 + }, + { + "epoch": 7.458305537024684, + "ref_ce_loss": 0.03750590234994888, + "step": 22360 + }, + { + "epoch": 7.461641094062708, + "loss": 0.3426, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "grad_norm": 5.301361083984375, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "learning_rate": 3.5487822472145487e-06, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "loss": 0.3800077438354492, + "step": 22370 + }, + { + "ce_loss": 0.06131187081336975, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "distill_loss": 0.17494143545627594, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "ref_ce_loss": 0.06068550422787666, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "loss": 0.19597984850406647, + "step": 22370 + }, + { + "ce_loss": 0.03716249763965607, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "distill_loss": 0.11552690714597702, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "ref_ce_loss": 0.043000392615795135, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "loss": 0.32776740193367004, + "step": 22370 + }, + { + "ce_loss": 0.04577228054404259, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "distill_loss": 0.15617133677005768, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "ref_ce_loss": 0.051974330097436905, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "loss": 0.2957746088504791, + "step": 22370 + }, + { + "ce_loss": 0.02486003190279007, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "distill_loss": 0.19778414070606232, + "epoch": 7.461641094062708, + "step": 22370 + }, + { + "epoch": 7.461641094062708, + "ref_ce_loss": 0.04370433837175369, + "step": 22370 + }, + { + "epoch": 7.464976651100734, + "loss": 0.3297, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "grad_norm": 2.884024143218994, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "learning_rate": 3.5051150217727197e-06, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "loss": 0.31238073110580444, + "step": 22380 + }, + { + "ce_loss": 0.05873372033238411, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "distill_loss": 0.1443156749010086, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "ref_ce_loss": 0.054802797734737396, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "loss": 0.648821234703064, + "step": 22380 + }, + { + "ce_loss": 0.030163580551743507, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "distill_loss": 0.17426523566246033, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "ref_ce_loss": 0.06576082855463028, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "loss": 0.34878626465797424, + "step": 22380 + }, + { + "ce_loss": 0.0797041729092598, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "distill_loss": 0.192304790019989, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "ref_ce_loss": 0.03504239022731781, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "loss": 0.14502675831317902, + "step": 22380 + }, + { + "ce_loss": 0.003223717911168933, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "distill_loss": 0.1210360899567604, + "epoch": 7.464976651100734, + "step": 22380 + }, + { + "epoch": 7.464976651100734, + "ref_ce_loss": 0.020702652633190155, + "step": 22380 + }, + { + "epoch": 7.468312208138759, + "loss": 0.3046, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "grad_norm": 2.712825059890747, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "learning_rate": 3.4617149452135897e-06, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "loss": 0.31714093685150146, + "step": 22390 + }, + { + "ce_loss": 0.02619621716439724, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "distill_loss": 0.15785712003707886, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "ref_ce_loss": 0.0551065132021904, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "loss": 0.24565093219280243, + "step": 22390 + }, + { + "ce_loss": 0.01495307870209217, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "distill_loss": 0.08529633283615112, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "ref_ce_loss": 0.041590429842472076, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "loss": 0.4593181312084198, + "step": 22390 + }, + { + "ce_loss": 0.02708379551768303, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "distill_loss": 0.15476664900779724, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "ref_ce_loss": 0.06588542461395264, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "loss": 0.42947351932525635, + "step": 22390 + }, + { + "ce_loss": 0.05233415216207504, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "distill_loss": 0.19729653000831604, + "epoch": 7.468312208138759, + "step": 22390 + }, + { + "epoch": 7.468312208138759, + "ref_ce_loss": 0.04568365216255188, + "step": 22390 + }, + { + "epoch": 7.471647765176785, + "loss": 0.319, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "grad_norm": 2.807758092880249, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "learning_rate": 3.418582096681766e-06, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "loss": 0.2579874098300934, + "step": 22400 + }, + { + "ce_loss": 0.01579379104077816, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "distill_loss": 0.12108300626277924, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "ref_ce_loss": 0.04175504297018051, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "loss": 0.3060038089752197, + "step": 22400 + }, + { + "ce_loss": 0.02407224290072918, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "distill_loss": 0.18588589131832123, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "ref_ce_loss": 0.04403572157025337, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "loss": 0.20720121264457703, + "step": 22400 + }, + { + "ce_loss": 0.027171071618795395, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "distill_loss": 0.10979503393173218, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "ref_ce_loss": 0.057181816548109055, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "loss": 0.24890443682670593, + "step": 22400 + }, + { + "ce_loss": 0.0304836668074131, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "distill_loss": 0.12231708317995071, + "epoch": 7.471647765176785, + "step": 22400 + }, + { + "epoch": 7.471647765176785, + "ref_ce_loss": 0.04766134172677994, + "step": 22400 + }, + { + "epoch": 7.474983322214809, + "loss": 0.3347, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "grad_norm": 2.27168345451355, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "learning_rate": 3.375716554834529e-06, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "loss": 0.22012270987033844, + "step": 22410 + }, + { + "ce_loss": 0.010555864311754704, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "distill_loss": 0.14049920439720154, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "ref_ce_loss": 0.03784632682800293, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "loss": 0.21688060462474823, + "step": 22410 + }, + { + "ce_loss": 0.003909058403223753, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "distill_loss": 0.1273895502090454, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "ref_ce_loss": 0.06180055812001228, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "loss": 0.6481152772903442, + "step": 22410 + }, + { + "ce_loss": 0.07658793032169342, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "distill_loss": 0.22413767874240875, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "ref_ce_loss": 0.0503942035138607, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "loss": 0.4268360733985901, + "step": 22410 + }, + { + "ce_loss": 0.046731140464544296, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "distill_loss": 0.11381864547729492, + "epoch": 7.474983322214809, + "step": 22410 + }, + { + "epoch": 7.474983322214809, + "ref_ce_loss": 0.03432987257838249, + "step": 22410 + }, + { + "epoch": 7.478318879252836, + "loss": 0.3312, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "grad_norm": 3.638110399246216, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "learning_rate": 3.3331183978417496e-06, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "loss": 0.20778049528598785, + "step": 22420 + }, + { + "ce_loss": 0.00603119982406497, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "distill_loss": 0.16420957446098328, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "ref_ce_loss": 0.03743180260062218, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "loss": 0.20351877808570862, + "step": 22420 + }, + { + "ce_loss": 0.001148868934251368, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "distill_loss": 0.11724899709224701, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "ref_ce_loss": 0.052420634776353836, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "loss": 0.2200160175561905, + "step": 22420 + }, + { + "ce_loss": 0.043612752109766006, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "distill_loss": 0.12745922803878784, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "ref_ce_loss": 0.03912469744682312, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "loss": 0.2272098958492279, + "step": 22420 + }, + { + "ce_loss": 0.023629697039723396, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "distill_loss": 0.15925733745098114, + "epoch": 7.478318879252836, + "step": 22420 + }, + { + "epoch": 7.478318879252836, + "ref_ce_loss": 0.02677595056593418, + "step": 22420 + }, + { + "epoch": 7.48165443629086, + "loss": 0.2968, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "grad_norm": 2.6959264278411865, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "learning_rate": 3.2907877033856387e-06, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "loss": 0.32253384590148926, + "step": 22430 + }, + { + "ce_loss": 0.06485221534967422, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "distill_loss": 0.17065764963626862, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "ref_ce_loss": 0.042805016040802, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "loss": 0.3148137629032135, + "step": 22430 + }, + { + "ce_loss": 0.022289402782917023, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "distill_loss": 0.09498833119869232, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "ref_ce_loss": 0.07857701182365417, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "loss": 0.19593794643878937, + "step": 22430 + }, + { + "ce_loss": 0.0311006810516119, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "distill_loss": 0.10990280658006668, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "ref_ce_loss": 0.03655588626861572, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "loss": 0.23988619446754456, + "step": 22430 + }, + { + "ce_loss": 0.049732793122529984, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "distill_loss": 0.10762669891119003, + "epoch": 7.48165443629086, + "step": 22430 + }, + { + "epoch": 7.48165443629086, + "ref_ce_loss": 0.043731361627578735, + "step": 22430 + }, + { + "epoch": 7.484989993328886, + "loss": 0.3048, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "grad_norm": 3.093358278274536, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "learning_rate": 3.2487245486607137e-06, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "loss": 0.29418665170669556, + "step": 22440 + }, + { + "ce_loss": 0.01880057156085968, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "distill_loss": 0.0965670645236969, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "ref_ce_loss": 0.03197801858186722, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "loss": 0.5496841073036194, + "step": 22440 + }, + { + "ce_loss": 0.0656195804476738, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "distill_loss": 0.3139565587043762, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "ref_ce_loss": 0.08307035267353058, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "loss": 0.32252851128578186, + "step": 22440 + }, + { + "ce_loss": 0.03442087024450302, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "distill_loss": 0.24391041696071625, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "ref_ce_loss": 0.04416259750723839, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "loss": 0.6654932498931885, + "step": 22440 + }, + { + "ce_loss": 0.01959805190563202, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "distill_loss": 0.22524727880954742, + "epoch": 7.484989993328886, + "step": 22440 + }, + { + "epoch": 7.484989993328886, + "ref_ce_loss": 0.08550272136926651, + "step": 22440 + }, + { + "epoch": 7.488325550366911, + "loss": 0.3117, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "grad_norm": 3.314997434616089, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "learning_rate": 3.206929010373549e-06, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "loss": 0.2675893306732178, + "step": 22450 + }, + { + "ce_loss": 0.001301237614825368, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "distill_loss": 0.1589747667312622, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "ref_ce_loss": 0.03607611358165741, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "loss": 0.2627130150794983, + "step": 22450 + }, + { + "ce_loss": 0.03996428847312927, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "distill_loss": 0.1684572398662567, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "ref_ce_loss": 0.03458402305841446, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "loss": 0.21059346199035645, + "step": 22450 + }, + { + "ce_loss": 0.0070762732066214085, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "distill_loss": 0.10879668593406677, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "ref_ce_loss": 0.05947238579392433, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "loss": 0.3560434579849243, + "step": 22450 + }, + { + "ce_loss": 0.028698332607746124, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "distill_loss": 0.13503122329711914, + "epoch": 7.488325550366911, + "step": 22450 + }, + { + "epoch": 7.488325550366911, + "ref_ce_loss": 0.072860948741436, + "step": 22450 + }, + { + "epoch": 7.491661107404937, + "loss": 0.3396, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "grad_norm": 3.122793674468994, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "learning_rate": 3.165401164742709e-06, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "loss": 0.6378544569015503, + "step": 22460 + }, + { + "ce_loss": 0.030728649348020554, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "distill_loss": 0.4074413776397705, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "ref_ce_loss": 0.05243341997265816, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "loss": 0.22806666791439056, + "step": 22460 + }, + { + "ce_loss": 0.033266473561525345, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "distill_loss": 0.11113715171813965, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "ref_ce_loss": 0.053928226232528687, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "loss": 0.2969411313533783, + "step": 22460 + }, + { + "ce_loss": 0.0053944881074130535, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "distill_loss": 0.17704196274280548, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "ref_ce_loss": 0.04825448989868164, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "loss": 0.314153790473938, + "step": 22460 + }, + { + "ce_loss": 0.031788185238838196, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "distill_loss": 0.19077731668949127, + "epoch": 7.491661107404937, + "step": 22460 + }, + { + "epoch": 7.491661107404937, + "ref_ce_loss": 0.0667947381734848, + "step": 22460 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.3497, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "grad_norm": 3.5940418243408203, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "learning_rate": 3.1241410874986495e-06, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.38101842999458313, + "step": 22470 + }, + { + "ce_loss": 0.02718617394566536, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "distill_loss": 0.16715385019779205, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "ref_ce_loss": 0.05261871591210365, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.3260224163532257, + "step": 22470 + }, + { + "ce_loss": 0.04991668462753296, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "distill_loss": 0.13207031786441803, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "ref_ce_loss": 0.0459817498922348, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.21622683107852936, + "step": 22470 + }, + { + "ce_loss": 0.021841280162334442, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "distill_loss": 0.14691105484962463, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "ref_ce_loss": 0.033241719007492065, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "loss": 0.42072075605392456, + "step": 22470 + }, + { + "ce_loss": 0.04634197801351547, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "distill_loss": 0.16500091552734375, + "epoch": 7.4949966644429615, + "step": 22470 + }, + { + "epoch": 7.4949966644429615, + "ref_ce_loss": 0.05034958943724632, + "step": 22470 + }, + { + "epoch": 7.498332221480988, + "loss": 0.3099, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "grad_norm": 2.503765821456909, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "learning_rate": 3.0831488538834328e-06, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "loss": 0.24750734865665436, + "step": 22480 + }, + { + "ce_loss": 0.017951542511582375, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "distill_loss": 0.17765724658966064, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "ref_ce_loss": 0.04299882426857948, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "loss": 0.530583918094635, + "step": 22480 + }, + { + "ce_loss": 0.05396854504942894, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "distill_loss": 0.19391027092933655, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "ref_ce_loss": 0.03469100221991539, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "loss": 0.3143228590488434, + "step": 22480 + }, + { + "ce_loss": 0.003916238900274038, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "distill_loss": 0.23221957683563232, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "ref_ce_loss": 0.04830393195152283, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "loss": 0.2439468801021576, + "step": 22480 + }, + { + "ce_loss": 0.03384243696928024, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "distill_loss": 0.12823912501335144, + "epoch": 7.498332221480988, + "step": 22480 + }, + { + "epoch": 7.498332221480988, + "ref_ce_loss": 0.05655447021126747, + "step": 22480 + }, + { + "epoch": 7.501667778519012, + "loss": 0.3374, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "grad_norm": 3.2659122943878174, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "learning_rate": 3.0424245386507286e-06, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "loss": 0.1496151238679886, + "step": 22490 + }, + { + "ce_loss": 0.005388977937400341, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "distill_loss": 0.10397054255008698, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "ref_ce_loss": 0.040133412927389145, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "loss": 0.41377997398376465, + "step": 22490 + }, + { + "ce_loss": 0.05353647097945213, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "distill_loss": 0.21046878397464752, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "ref_ce_loss": 0.05055622756481171, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "loss": 0.298939049243927, + "step": 22490 + }, + { + "ce_loss": 0.07888401299715042, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "distill_loss": 0.13656878471374512, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "ref_ce_loss": 0.05380229651927948, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "loss": 0.2905265688896179, + "step": 22490 + }, + { + "ce_loss": 0.022800395265221596, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "distill_loss": 0.1571730673313141, + "epoch": 7.501667778519012, + "step": 22490 + }, + { + "epoch": 7.501667778519012, + "ref_ce_loss": 0.05390867590904236, + "step": 22490 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.321, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "grad_norm": 3.711552381515503, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "learning_rate": 3.0019682160656642e-06, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.2089131623506546, + "step": 22500 + }, + { + "ce_loss": 0.02487696148455143, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "distill_loss": 0.13498246669769287, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "ref_ce_loss": 0.04884064197540283, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.5317951440811157, + "step": 22500 + }, + { + "ce_loss": 0.018521640449762344, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "distill_loss": 0.10497358441352844, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "ref_ce_loss": 0.07159274071455002, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.33921340107917786, + "step": 22500 + }, + { + "ce_loss": 0.060679879039525986, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "distill_loss": 0.14240986108779907, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "ref_ce_loss": 0.06749631464481354, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "loss": 0.3387434482574463, + "step": 22500 + }, + { + "ce_loss": 0.0036824748385697603, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "distill_loss": 0.14505694806575775, + "epoch": 7.5050033355570385, + "step": 22500 + }, + { + "epoch": 7.5050033355570385, + "ref_ce_loss": 0.04898282513022423, + "step": 22500 + }, + { + "epoch": 7.508338892595063, + "loss": 0.343, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "grad_norm": 3.4600985050201416, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "learning_rate": 2.9617799599045588e-06, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "loss": 0.23449355363845825, + "step": 22510 + }, + { + "ce_loss": 0.010881789028644562, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "distill_loss": 0.17654874920845032, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "ref_ce_loss": 0.02779705449938774, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "loss": 0.5081146359443665, + "step": 22510 + }, + { + "ce_loss": 0.06562300026416779, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "distill_loss": 0.2453111857175827, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "ref_ce_loss": 0.07138422876596451, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "loss": 0.17814725637435913, + "step": 22510 + }, + { + "ce_loss": 0.004538268316537142, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "distill_loss": 0.1134893000125885, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "ref_ce_loss": 0.03875197842717171, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "loss": 0.24402591586112976, + "step": 22510 + }, + { + "ce_loss": 0.023568812757730484, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "distill_loss": 0.16957291960716248, + "epoch": 7.508338892595063, + "step": 22510 + }, + { + "epoch": 7.508338892595063, + "ref_ce_loss": 0.0506262481212616, + "step": 22510 + }, + { + "epoch": 7.511674449633089, + "loss": 0.3148, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "grad_norm": 5.0570549964904785, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "learning_rate": 2.9218598434549876e-06, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "loss": 0.30400851368904114, + "step": 22520 + }, + { + "ce_loss": 0.02792799100279808, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "distill_loss": 0.2112709879875183, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "ref_ce_loss": 0.053309354931116104, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "loss": 0.2785424590110779, + "step": 22520 + }, + { + "ce_loss": 0.041298575699329376, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "distill_loss": 0.18324530124664307, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "ref_ce_loss": 0.05387912318110466, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "loss": 0.3883103132247925, + "step": 22520 + }, + { + "ce_loss": 0.010573752224445343, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "distill_loss": 0.317501962184906, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "ref_ce_loss": 0.036491647362709045, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "loss": 0.18496163189411163, + "step": 22520 + }, + { + "ce_loss": 0.01726536639034748, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "distill_loss": 0.13057807087898254, + "epoch": 7.511674449633089, + "step": 22520 + }, + { + "epoch": 7.511674449633089, + "ref_ce_loss": 0.03697721287608147, + "step": 22520 + }, + { + "epoch": 7.515010006671114, + "loss": 0.3219, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "grad_norm": 4.546923637390137, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "learning_rate": 2.882207939515435e-06, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "loss": 0.5167214274406433, + "step": 22530 + }, + { + "ce_loss": 0.02184062823653221, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "distill_loss": 0.2753063440322876, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "ref_ce_loss": 0.057973772287368774, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "loss": 0.2361687868833542, + "step": 22530 + }, + { + "ce_loss": 0.043745920062065125, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "distill_loss": 0.10686971247196198, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "ref_ce_loss": 0.05657653138041496, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "loss": 0.26156085729599, + "step": 22530 + }, + { + "ce_loss": 0.014919279143214226, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "distill_loss": 0.17853476107120514, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "ref_ce_loss": 0.05031314119696617, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "loss": 0.22457215189933777, + "step": 22530 + }, + { + "ce_loss": 0.05354100838303566, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "distill_loss": 0.11665277928113937, + "epoch": 7.515010006671114, + "step": 22530 + }, + { + "epoch": 7.515010006671114, + "ref_ce_loss": 0.05427921563386917, + "step": 22530 + }, + { + "epoch": 7.51834556370914, + "loss": 0.3382, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "grad_norm": 3.0563387870788574, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "learning_rate": 2.842824320395376e-06, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "loss": 0.3265399932861328, + "step": 22540 + }, + { + "ce_loss": 0.036622580140829086, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "distill_loss": 0.18803177773952484, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "ref_ce_loss": 0.05568910762667656, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "loss": 0.3589942455291748, + "step": 22540 + }, + { + "ce_loss": 0.07598867267370224, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "distill_loss": 0.2123461663722992, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "ref_ce_loss": 0.051019832491874695, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "loss": 0.19223347306251526, + "step": 22540 + }, + { + "ce_loss": 0.024045389145612717, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "distill_loss": 0.12357673048973083, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "ref_ce_loss": 0.0445307120680809, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "loss": 0.242498517036438, + "step": 22540 + }, + { + "ce_loss": 0.021375758573412895, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "distill_loss": 0.1691853255033493, + "epoch": 7.51834556370914, + "step": 22540 + }, + { + "epoch": 7.51834556370914, + "ref_ce_loss": 0.051571715623140335, + "step": 22540 + }, + { + "epoch": 7.521681120747164, + "loss": 0.3048, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "grad_norm": 3.6586086750030518, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "learning_rate": 2.80370905791501e-06, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "loss": 0.18601231276988983, + "step": 22550 + }, + { + "ce_loss": 0.018930969759821892, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "distill_loss": 0.12149959802627563, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "ref_ce_loss": 0.03169435262680054, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "loss": 0.21780389547348022, + "step": 22550 + }, + { + "ce_loss": 0.017960339784622192, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "distill_loss": 0.10064493864774704, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "ref_ce_loss": 0.03087751753628254, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "loss": 0.26542627811431885, + "step": 22550 + }, + { + "ce_loss": 0.04006009176373482, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "distill_loss": 0.19296851754188538, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "ref_ce_loss": 0.032322581857442856, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "loss": 0.5728837251663208, + "step": 22550 + }, + { + "ce_loss": 0.023503512144088745, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "distill_loss": 0.22009573876857758, + "epoch": 7.521681120747164, + "step": 22550 + }, + { + "epoch": 7.521681120747164, + "ref_ce_loss": 0.05492442473769188, + "step": 22550 + }, + { + "epoch": 7.525016677785191, + "loss": 0.3103, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "grad_norm": 5.0174641609191895, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "learning_rate": 2.7648622234050955e-06, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "loss": 0.27467140555381775, + "step": 22560 + }, + { + "ce_loss": 0.02688334323465824, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "distill_loss": 0.17189669609069824, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "ref_ce_loss": 0.050332751125097275, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "loss": 0.24028046429157257, + "step": 22560 + }, + { + "ce_loss": 0.021113159134984016, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "distill_loss": 0.11697718501091003, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "ref_ce_loss": 0.061597343534231186, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "loss": 0.19227828085422516, + "step": 22560 + }, + { + "ce_loss": 0.008966523222625256, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "distill_loss": 0.122565358877182, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "ref_ce_loss": 0.038611847907304764, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "loss": 0.3993758261203766, + "step": 22560 + }, + { + "ce_loss": 0.018132777884602547, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "distill_loss": 0.21471086144447327, + "epoch": 7.525016677785191, + "step": 22560 + }, + { + "epoch": 7.525016677785191, + "ref_ce_loss": 0.0625390112400055, + "step": 22560 + }, + { + "epoch": 7.528352234823215, + "loss": 0.3156, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "grad_norm": 3.8104166984558105, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "learning_rate": 2.7262838877069982e-06, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "loss": 0.36098623275756836, + "step": 22570 + }, + { + "ce_loss": 0.04272822290658951, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "distill_loss": 0.17549145221710205, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "ref_ce_loss": 0.05325407162308693, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "loss": 0.31836822628974915, + "step": 22570 + }, + { + "ce_loss": 0.03900127857923508, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "distill_loss": 0.1361355483531952, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "ref_ce_loss": 0.07174643129110336, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "loss": 0.37250638008117676, + "step": 22570 + }, + { + "ce_loss": 0.02075023390352726, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "distill_loss": 0.19733430445194244, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "ref_ce_loss": 0.05269167572259903, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "loss": 0.4085127115249634, + "step": 22570 + }, + { + "ce_loss": 0.08139225840568542, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "distill_loss": 0.15659675002098083, + "epoch": 7.528352234823215, + "step": 22570 + }, + { + "epoch": 7.528352234823215, + "ref_ce_loss": 0.0547124482691288, + "step": 22570 + }, + { + "epoch": 7.531687791861241, + "loss": 0.3288, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "grad_norm": 3.1944923400878906, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "learning_rate": 2.687974121172326e-06, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "loss": 0.4330209791660309, + "step": 22580 + }, + { + "ce_loss": 0.05265730619430542, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "distill_loss": 0.21802186965942383, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "ref_ce_loss": 0.05064508691430092, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "loss": 0.2763577401638031, + "step": 22580 + }, + { + "ce_loss": 0.0059791612438857555, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "distill_loss": 0.19465699791908264, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "ref_ce_loss": 0.04413512349128723, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "loss": 0.15393657982349396, + "step": 22580 + }, + { + "ce_loss": 0.031021667644381523, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "distill_loss": 0.08659686893224716, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "ref_ce_loss": 0.02658427506685257, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "loss": 0.21959280967712402, + "step": 22580 + }, + { + "ce_loss": 0.0054153925739228725, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "distill_loss": 0.14491917192935944, + "epoch": 7.531687791861241, + "step": 22580 + }, + { + "epoch": 7.531687791861241, + "ref_ce_loss": 0.036969490349292755, + "step": 22580 + }, + { + "epoch": 7.535023348899266, + "loss": 0.3174, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "grad_norm": 3.328404188156128, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "learning_rate": 2.649932993663012e-06, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "loss": 0.42319804430007935, + "step": 22590 + }, + { + "ce_loss": 0.030848875641822815, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "distill_loss": 0.3383166790008545, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "ref_ce_loss": 0.041436757892370224, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "loss": 0.5148337483406067, + "step": 22590 + }, + { + "ce_loss": 0.012426701374351978, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "distill_loss": 0.17431752383708954, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "ref_ce_loss": 0.10234980285167694, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "loss": 0.24014095962047577, + "step": 22590 + }, + { + "ce_loss": 0.0075201112776994705, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "distill_loss": 0.16021908819675446, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "ref_ce_loss": 0.044868361204862595, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "loss": 0.27141350507736206, + "step": 22590 + }, + { + "ce_loss": 0.042211223393678665, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "distill_loss": 0.10830758512020111, + "epoch": 7.535023348899266, + "step": 22590 + }, + { + "epoch": 7.535023348899266, + "ref_ce_loss": 0.061452366411685944, + "step": 22590 + }, + { + "epoch": 7.538358905937292, + "loss": 0.3597, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "grad_norm": 2.722214460372925, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "learning_rate": 2.6121605745510475e-06, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "loss": 0.3262496888637543, + "step": 22600 + }, + { + "ce_loss": 0.034692954272031784, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "distill_loss": 0.23076552152633667, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "ref_ce_loss": 0.04774779826402664, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "loss": 0.4575178325176239, + "step": 22600 + }, + { + "ce_loss": 0.05895520746707916, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "distill_loss": 0.2657552659511566, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "ref_ce_loss": 0.09668312966823578, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "loss": 0.3082955479621887, + "step": 22600 + }, + { + "ce_loss": 0.013990761712193489, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "distill_loss": 0.2193032056093216, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "ref_ce_loss": 0.04674745351076126, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "loss": 0.21660171449184418, + "step": 22600 + }, + { + "ce_loss": 0.020478934049606323, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "distill_loss": 0.13979895412921906, + "epoch": 7.538358905937292, + "step": 22600 + }, + { + "epoch": 7.538358905937292, + "ref_ce_loss": 0.0560581237077713, + "step": 22600 + }, + { + "epoch": 7.541694462975316, + "loss": 0.2894, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "grad_norm": 1.874458909034729, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "learning_rate": 2.574656932718433e-06, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "loss": 0.32055795192718506, + "step": 22610 + }, + { + "ce_loss": 0.028949877247214317, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "distill_loss": 0.1181693822145462, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "ref_ce_loss": 0.048266347497701645, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "loss": 0.26999005675315857, + "step": 22610 + }, + { + "ce_loss": 0.0427066795527935, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "distill_loss": 0.13798220455646515, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "ref_ce_loss": 0.041861649602651596, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "loss": 0.37607231736183167, + "step": 22610 + }, + { + "ce_loss": 0.023039717227220535, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "distill_loss": 0.21862857043743134, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "ref_ce_loss": 0.05255983769893646, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "loss": 0.3798355758190155, + "step": 22610 + }, + { + "ce_loss": 0.06908748298883438, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "distill_loss": 0.27530989050865173, + "epoch": 7.541694462975316, + "step": 22610 + }, + { + "epoch": 7.541694462975316, + "ref_ce_loss": 0.03530760481953621, + "step": 22610 + }, + { + "epoch": 7.545030020013343, + "loss": 0.3135, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "grad_norm": 3.8495988845825195, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "learning_rate": 2.5374221365570435e-06, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "loss": 0.3512667715549469, + "step": 22620 + }, + { + "ce_loss": 0.04440957307815552, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "distill_loss": 0.20252563059329987, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "ref_ce_loss": 0.0638410672545433, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "loss": 0.25497967004776, + "step": 22620 + }, + { + "ce_loss": 0.025934748351573944, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "distill_loss": 0.118730828166008, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "ref_ce_loss": 0.04510124400258064, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "loss": 0.33090710639953613, + "step": 22620 + }, + { + "ce_loss": 0.055013976991176605, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "distill_loss": 0.22313421964645386, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "ref_ce_loss": 0.05255810543894768, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "loss": 0.3884758949279785, + "step": 22620 + }, + { + "ce_loss": 0.015666665509343147, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "distill_loss": 0.23140020668506622, + "epoch": 7.545030020013343, + "step": 22620 + }, + { + "epoch": 7.545030020013343, + "ref_ce_loss": 0.061135634779930115, + "step": 22620 + }, + { + "epoch": 7.548365577051367, + "loss": 0.3352, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "grad_norm": 3.073249101638794, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "learning_rate": 2.50045625396843e-06, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "loss": 0.2203093022108078, + "step": 22630 + }, + { + "ce_loss": 0.002241781447082758, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "distill_loss": 0.17604470252990723, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "ref_ce_loss": 0.04158555343747139, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "loss": 0.17070531845092773, + "step": 22630 + }, + { + "ce_loss": 0.015439015813171864, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "distill_loss": 0.10008960217237473, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "ref_ce_loss": 0.03315176069736481, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "loss": 0.21806734800338745, + "step": 22630 + }, + { + "ce_loss": 0.044264134019613266, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "distill_loss": 0.12509466707706451, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "ref_ce_loss": 0.01920865662395954, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "loss": 0.3088948428630829, + "step": 22630 + }, + { + "ce_loss": 0.027766257524490356, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "distill_loss": 0.22647210955619812, + "epoch": 7.548365577051367, + "step": 22630 + }, + { + "epoch": 7.548365577051367, + "ref_ce_loss": 0.040508657693862915, + "step": 22630 + }, + { + "epoch": 7.551701134089393, + "loss": 0.306, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "grad_norm": 3.8144543170928955, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "learning_rate": 2.4637593523637866e-06, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "loss": 0.23312872648239136, + "step": 22640 + }, + { + "ce_loss": 0.032705824822187424, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "distill_loss": 0.12312594056129456, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "ref_ce_loss": 0.05955107882618904, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "loss": 0.5067400932312012, + "step": 22640 + }, + { + "ce_loss": 0.019309131428599358, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "distill_loss": 0.3579646050930023, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "ref_ce_loss": 0.0795922577381134, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "loss": 0.2373778223991394, + "step": 22640 + }, + { + "ce_loss": 0.03473667427897453, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "distill_loss": 0.13048987090587616, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "ref_ce_loss": 0.05845621973276138, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "loss": 0.21917320787906647, + "step": 22640 + }, + { + "ce_loss": 0.044745367020368576, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "distill_loss": 0.13405704498291016, + "epoch": 7.551701134089393, + "step": 22640 + }, + { + "epoch": 7.551701134089393, + "ref_ce_loss": 0.031917721033096313, + "step": 22640 + }, + { + "epoch": 7.555036691127418, + "loss": 0.3399, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "grad_norm": 2.414365768432617, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "learning_rate": 2.4273314986637813e-06, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "loss": 0.22703129053115845, + "step": 22650 + }, + { + "ce_loss": 0.024940047413110733, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "distill_loss": 0.13973286747932434, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "ref_ce_loss": 0.02829953469336033, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "loss": 0.21240630745887756, + "step": 22650 + }, + { + "ce_loss": 0.0018051156075671315, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "distill_loss": 0.14401955902576447, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "ref_ce_loss": 0.024986490607261658, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "loss": 0.4377570152282715, + "step": 22650 + }, + { + "ce_loss": 0.03963511064648628, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "distill_loss": 0.20817264914512634, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "ref_ce_loss": 0.055689506232738495, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "loss": 0.28146106004714966, + "step": 22650 + }, + { + "ce_loss": 0.03253614529967308, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "distill_loss": 0.13565385341644287, + "epoch": 7.555036691127418, + "step": 22650 + }, + { + "epoch": 7.555036691127418, + "ref_ce_loss": 0.04473499208688736, + "step": 22650 + }, + { + "epoch": 7.558372248165444, + "loss": 0.3169, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "grad_norm": 2.6511383056640625, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "learning_rate": 2.3911727592984597e-06, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "loss": 0.38556116819381714, + "step": 22660 + }, + { + "ce_loss": 0.033701617270708084, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "distill_loss": 0.2773969769477844, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "ref_ce_loss": 0.04286494478583336, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "loss": 0.2758292853832245, + "step": 22660 + }, + { + "ce_loss": 0.0252090934664011, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "distill_loss": 0.12194667011499405, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "ref_ce_loss": 0.04058091342449188, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "loss": 0.4743751287460327, + "step": 22660 + }, + { + "ce_loss": 0.10802572220563889, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "distill_loss": 0.2059323936700821, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "ref_ce_loss": 0.04329385235905647, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "loss": 0.26553261280059814, + "step": 22660 + }, + { + "ce_loss": 0.027559636160731316, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "distill_loss": 0.11232174932956696, + "epoch": 7.558372248165444, + "step": 22660 + }, + { + "epoch": 7.558372248165444, + "ref_ce_loss": 0.03847008943557739, + "step": 22660 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.3085, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "grad_norm": 3.0139541625976562, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "learning_rate": 2.355283200207092e-06, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.4569222927093506, + "step": 22670 + }, + { + "ce_loss": 0.10298973321914673, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "distill_loss": 0.23029737174510956, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "ref_ce_loss": 0.08486660569906235, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.2001618891954422, + "step": 22670 + }, + { + "ce_loss": 0.03799591213464737, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "distill_loss": 0.1290132999420166, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "ref_ce_loss": 0.03295820206403732, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.23542585968971252, + "step": 22670 + }, + { + "ce_loss": 0.07174541801214218, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "distill_loss": 0.10788275301456451, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "ref_ce_loss": 0.038297832012176514, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "loss": 0.269972026348114, + "step": 22670 + }, + { + "ce_loss": 0.02759367786347866, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "distill_loss": 0.16550669074058533, + "epoch": 7.5617078052034685, + "step": 22670 + }, + { + "epoch": 7.5617078052034685, + "ref_ce_loss": 0.04217568784952164, + "step": 22670 + }, + { + "epoch": 7.565043362241495, + "loss": 0.3112, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "grad_norm": 3.507418155670166, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "learning_rate": 2.319662886838075e-06, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "loss": 0.18114669620990753, + "step": 22680 + }, + { + "ce_loss": 0.031178679317235947, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "distill_loss": 0.10116644203662872, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "ref_ce_loss": 0.04861505329608917, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "loss": 0.32580307126045227, + "step": 22680 + }, + { + "ce_loss": 0.04682064428925514, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "distill_loss": 0.2161271870136261, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "ref_ce_loss": 0.062445301562547684, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "loss": 0.31231689453125, + "step": 22680 + }, + { + "ce_loss": 0.012379839085042477, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "distill_loss": 0.19624949991703033, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "ref_ce_loss": 0.054930321872234344, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "loss": 0.43421971797943115, + "step": 22680 + }, + { + "ce_loss": 0.07423277199268341, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "distill_loss": 0.15245267748832703, + "epoch": 7.565043362241495, + "step": 22680 + }, + { + "epoch": 7.565043362241495, + "ref_ce_loss": 0.06348980963230133, + "step": 22680 + }, + { + "epoch": 7.568378919279519, + "loss": 0.3025, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "grad_norm": 2.772374153137207, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "learning_rate": 2.2843118841488315e-06, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "loss": 0.3740849494934082, + "step": 22690 + }, + { + "ce_loss": 0.05600299686193466, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "distill_loss": 0.1683557629585266, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "ref_ce_loss": 0.051150351762771606, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "loss": 0.172612264752388, + "step": 22690 + }, + { + "ce_loss": 0.013495842926204205, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "distill_loss": 0.1024136021733284, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "ref_ce_loss": 0.03655596449971199, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "loss": 0.22008052468299866, + "step": 22690 + }, + { + "ce_loss": 0.029409054666757584, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "distill_loss": 0.1552516669034958, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "ref_ce_loss": 0.03526454046368599, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "loss": 0.4758220314979553, + "step": 22690 + }, + { + "ce_loss": 0.04872387647628784, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "distill_loss": 0.2343422919511795, + "epoch": 7.568378919279519, + "step": 22690 + }, + { + "epoch": 7.568378919279519, + "ref_ce_loss": 0.04983096569776535, + "step": 22690 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.3149, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "grad_norm": 4.173081874847412, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "learning_rate": 2.249230256605611e-06, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.2925843894481659, + "step": 22700 + }, + { + "ce_loss": 0.03678572177886963, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "distill_loss": 0.17235167324543, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "ref_ce_loss": 0.06548989564180374, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.2966710925102234, + "step": 22700 + }, + { + "ce_loss": 0.0343879796564579, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "distill_loss": 0.20626069605350494, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "ref_ce_loss": 0.03526905179023743, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.27978846430778503, + "step": 22700 + }, + { + "ce_loss": 0.05245629698038101, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "distill_loss": 0.18162769079208374, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "ref_ce_loss": 0.04530799016356468, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "loss": 0.18752911686897278, + "step": 22700 + }, + { + "ce_loss": 0.008691946044564247, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "distill_loss": 0.10002614557743073, + "epoch": 7.5717144763175455, + "step": 22700 + }, + { + "epoch": 7.5717144763175455, + "ref_ce_loss": 0.03438608720898628, + "step": 22700 + }, + { + "epoch": 7.57505003335557, + "loss": 0.3044, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "grad_norm": 3.8426270484924316, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "learning_rate": 2.214418068183471e-06, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "loss": 0.29993754625320435, + "step": 22710 + }, + { + "ce_loss": 0.002770091639831662, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "distill_loss": 0.2069040983915329, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "ref_ce_loss": 0.0428432896733284, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "loss": 0.27584877610206604, + "step": 22710 + }, + { + "ce_loss": 0.07741032540798187, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "distill_loss": 0.14802144467830658, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "ref_ce_loss": 0.038278382271528244, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "loss": 0.1868162751197815, + "step": 22710 + }, + { + "ce_loss": 0.024370383471250534, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "distill_loss": 0.10414793342351913, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "ref_ce_loss": 0.03851045295596123, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "loss": 0.24429672956466675, + "step": 22710 + }, + { + "ce_loss": 0.009652115404605865, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "distill_loss": 0.10027230530977249, + "epoch": 7.57505003335557, + "step": 22710 + }, + { + "epoch": 7.57505003335557, + "ref_ce_loss": 0.04407824948430061, + "step": 22710 + }, + { + "epoch": 7.578385590393596, + "loss": 0.3243, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "grad_norm": 3.664708375930786, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "learning_rate": 2.1798753823661308e-06, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "loss": 0.3406020700931549, + "step": 22720 + }, + { + "ce_loss": 0.06698433309793472, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "distill_loss": 0.1397649645805359, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "ref_ce_loss": 0.04336464777588844, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "loss": 0.7421815395355225, + "step": 22720 + }, + { + "ce_loss": 0.09118698537349701, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "distill_loss": 0.29913344979286194, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "ref_ce_loss": 0.09334400296211243, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "loss": 0.2611655294895172, + "step": 22720 + }, + { + "ce_loss": 0.0070807188749313354, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "distill_loss": 0.14511138200759888, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "ref_ce_loss": 0.038073521107435226, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "loss": 0.26241040229797363, + "step": 22720 + }, + { + "ce_loss": 0.04022922366857529, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "distill_loss": 0.1806827187538147, + "epoch": 7.578385590393596, + "step": 22720 + }, + { + "epoch": 7.578385590393596, + "ref_ce_loss": 0.03285058215260506, + "step": 22720 + }, + { + "epoch": 7.581721147431621, + "loss": 0.337, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "grad_norm": 3.567667007446289, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "learning_rate": 2.1456022621458347e-06, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "loss": 0.3689475655555725, + "step": 22730 + }, + { + "ce_loss": 0.10052093863487244, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "distill_loss": 0.17419344186782837, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "ref_ce_loss": 0.038105469197034836, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "loss": 0.3528992533683777, + "step": 22730 + }, + { + "ce_loss": 0.025013385340571404, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "distill_loss": 0.25239109992980957, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "ref_ce_loss": 0.039866313338279724, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "loss": 0.3315299451351166, + "step": 22730 + }, + { + "ce_loss": 0.045371163636446, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "distill_loss": 0.18333026766777039, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "ref_ce_loss": 0.05455062538385391, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "loss": 0.1749919056892395, + "step": 22730 + }, + { + "ce_loss": 0.004356476478278637, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "distill_loss": 0.12413419783115387, + "epoch": 7.581721147431621, + "step": 22730 + }, + { + "epoch": 7.581721147431621, + "ref_ce_loss": 0.03222940117120743, + "step": 22730 + }, + { + "epoch": 7.585056704469647, + "loss": 0.2909, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "grad_norm": 3.138673782348633, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "learning_rate": 2.1115987700231873e-06, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "loss": 0.20833194255828857, + "step": 22740 + }, + { + "ce_loss": 0.04004382714629173, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "distill_loss": 0.12045399099588394, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "ref_ce_loss": 0.02083931490778923, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "loss": 0.2998196482658386, + "step": 22740 + }, + { + "ce_loss": 0.04018447920680046, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "distill_loss": 0.15190063416957855, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "ref_ce_loss": 0.03249194100499153, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "loss": 0.1679462492465973, + "step": 22740 + }, + { + "ce_loss": 0.007802645675837994, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "distill_loss": 0.11544067412614822, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "ref_ce_loss": 0.032458383589982986, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "loss": 0.31334739923477173, + "step": 22740 + }, + { + "ce_loss": 0.0546376071870327, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "distill_loss": 0.20922823250293732, + "epoch": 7.585056704469647, + "step": 22740 + }, + { + "epoch": 7.585056704469647, + "ref_ce_loss": 0.04924513027071953, + "step": 22740 + }, + { + "epoch": 7.588392261507671, + "loss": 0.2875, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "grad_norm": 2.8379061222076416, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "learning_rate": 2.0778649680071867e-06, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "loss": 0.5469861030578613, + "step": 22750 + }, + { + "ce_loss": 0.0687357634305954, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "distill_loss": 0.3410850465297699, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "ref_ce_loss": 0.08484728634357452, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "loss": 0.5010547041893005, + "step": 22750 + }, + { + "ce_loss": 0.010975878685712814, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "distill_loss": 0.329995721578598, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "ref_ce_loss": 0.05636501684784889, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "loss": 0.25946545600891113, + "step": 22750 + }, + { + "ce_loss": 0.02155529521405697, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "distill_loss": 0.1841026395559311, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "ref_ce_loss": 0.05362752079963684, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "loss": 0.20447981357574463, + "step": 22750 + }, + { + "ce_loss": 0.015658462420105934, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "distill_loss": 0.13859112560749054, + "epoch": 7.588392261507671, + "step": 22750 + }, + { + "epoch": 7.588392261507671, + "ref_ce_loss": 0.050109222531318665, + "step": 22750 + }, + { + "epoch": 7.591727818545698, + "loss": 0.3008, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "grad_norm": 2.1122565269470215, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "learning_rate": 2.0444009176149414e-06, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "loss": 0.2298000007867813, + "step": 22760 + }, + { + "ce_loss": 0.018491631373763084, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "distill_loss": 0.12337653338909149, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "ref_ce_loss": 0.03470870107412338, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "loss": 0.30644336342811584, + "step": 22760 + }, + { + "ce_loss": 0.022262774407863617, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "distill_loss": 0.21743980050086975, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "ref_ce_loss": 0.027649324387311935, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "loss": 0.15351411700248718, + "step": 22760 + }, + { + "ce_loss": 0.014782496728003025, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "distill_loss": 0.09903530776500702, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "ref_ce_loss": 0.03957769274711609, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "loss": 0.3583006262779236, + "step": 22760 + }, + { + "ce_loss": 0.07922881096601486, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "distill_loss": 0.15731580555438995, + "epoch": 7.591727818545698, + "step": 22760 + }, + { + "epoch": 7.591727818545698, + "ref_ce_loss": 0.05712786689400673, + "step": 22760 + }, + { + "epoch": 7.595063375583722, + "loss": 0.3052, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "grad_norm": 4.922539234161377, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "learning_rate": 2.011206679871702e-06, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "loss": 0.2932712733745575, + "step": 22770 + }, + { + "ce_loss": 0.03609336167573929, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "distill_loss": 0.17725861072540283, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "ref_ce_loss": 0.056249652057886124, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "loss": 0.2576265335083008, + "step": 22770 + }, + { + "ce_loss": 0.030701031908392906, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "distill_loss": 0.16068829596042633, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "ref_ce_loss": 0.04590294510126114, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "loss": 0.5291174054145813, + "step": 22770 + }, + { + "ce_loss": 0.012448878027498722, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "distill_loss": 0.2315932810306549, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "ref_ce_loss": 0.052382610738277435, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "loss": 0.5043610334396362, + "step": 22770 + }, + { + "ce_loss": 0.06939585506916046, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "distill_loss": 0.2098284661769867, + "epoch": 7.595063375583722, + "step": 22770 + }, + { + "epoch": 7.595063375583722, + "ref_ce_loss": 0.07914602756500244, + "step": 22770 + }, + { + "epoch": 7.598398932621748, + "loss": 0.3557, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "grad_norm": 5.053133487701416, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "learning_rate": 1.9782823153106808e-06, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "loss": 0.3108091354370117, + "step": 22780 + }, + { + "ce_loss": 0.008028519339859486, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "distill_loss": 0.15519407391548157, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "ref_ce_loss": 0.0441800020635128, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "loss": 0.2354048192501068, + "step": 22780 + }, + { + "ce_loss": 0.04343513026833534, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "distill_loss": 0.1344708949327469, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "ref_ce_loss": 0.0410119891166687, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "loss": 0.3621920049190521, + "step": 22780 + }, + { + "ce_loss": 0.08181270211935043, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "distill_loss": 0.21230238676071167, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "ref_ce_loss": 0.056477129459381104, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "loss": 0.2663940489292145, + "step": 22780 + }, + { + "ce_loss": 0.036799490451812744, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "distill_loss": 0.1391897201538086, + "epoch": 7.598398932621748, + "step": 22780 + }, + { + "epoch": 7.598398932621748, + "ref_ce_loss": 0.05978408828377724, + "step": 22780 + }, + { + "epoch": 7.601734489659773, + "loss": 0.3431, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "grad_norm": 5.396562576293945, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "learning_rate": 1.9456278839729165e-06, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "loss": 0.2972421944141388, + "step": 22790 + }, + { + "ce_loss": 0.00912957452237606, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "distill_loss": 0.10416600853204727, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "ref_ce_loss": 0.024486741051077843, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "loss": 0.3115427494049072, + "step": 22790 + }, + { + "ce_loss": 0.09411104768514633, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "distill_loss": 0.13358083367347717, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "ref_ce_loss": 0.05374515801668167, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "loss": 0.25783830881118774, + "step": 22790 + }, + { + "ce_loss": 0.0035260599106550217, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "distill_loss": 0.16822956502437592, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "ref_ce_loss": 0.05114159360527992, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "loss": 0.17619512975215912, + "step": 22790 + }, + { + "ce_loss": 0.0018359140958637, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "distill_loss": 0.1140362024307251, + "epoch": 7.601734489659773, + "step": 22790 + }, + { + "epoch": 7.601734489659773, + "ref_ce_loss": 0.021149538457393646, + "step": 22790 + }, + { + "epoch": 7.605070046697799, + "loss": 0.302, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "grad_norm": 2.289646863937378, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "learning_rate": 1.913243445407192e-06, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "loss": 0.4405611455440521, + "step": 22800 + }, + { + "ce_loss": 0.03355856612324715, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "distill_loss": 0.11231415718793869, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "ref_ce_loss": 0.041811395436525345, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "loss": 0.30768993496894836, + "step": 22800 + }, + { + "ce_loss": 0.02752743847668171, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "distill_loss": 0.15471151471138, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "ref_ce_loss": 0.03334573283791542, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "loss": 0.18778252601623535, + "step": 22800 + }, + { + "ce_loss": 0.005070790182799101, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "distill_loss": 0.10848856717348099, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "ref_ce_loss": 0.03893861174583435, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "loss": 0.4052788019180298, + "step": 22800 + }, + { + "ce_loss": 0.05161317065358162, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "distill_loss": 0.2581633925437927, + "epoch": 7.605070046697799, + "step": 22800 + }, + { + "epoch": 7.605070046697799, + "ref_ce_loss": 0.06759831309318542, + "step": 22800 + }, + { + "epoch": 7.608405603735823, + "loss": 0.3476, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "grad_norm": 3.0757687091827393, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "learning_rate": 1.8811290586699834e-06, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "loss": 0.26413461565971375, + "step": 22810 + }, + { + "ce_loss": 0.03497738763689995, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "distill_loss": 0.1722661554813385, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "ref_ce_loss": 0.05670714005827904, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "loss": 0.3696792721748352, + "step": 22810 + }, + { + "ce_loss": 0.03719585761427879, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "distill_loss": 0.19359298050403595, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "ref_ce_loss": 0.05511196702718735, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "loss": 0.2747287154197693, + "step": 22810 + }, + { + "ce_loss": 0.02445542812347412, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "distill_loss": 0.1185799315571785, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "ref_ce_loss": 0.042290497571229935, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "loss": 0.5267252326011658, + "step": 22810 + }, + { + "ce_loss": 0.009008359163999557, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "distill_loss": 0.27464696764945984, + "epoch": 7.608405603735823, + "step": 22810 + }, + { + "epoch": 7.608405603735823, + "ref_ce_loss": 0.1000165343284607, + "step": 22810 + }, + { + "epoch": 7.61174116077385, + "loss": 0.3708, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "grad_norm": 4.702095031738281, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "learning_rate": 1.849284782325211e-06, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "loss": 0.5826479196548462, + "step": 22820 + }, + { + "ce_loss": 0.055447544902563095, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "distill_loss": 0.3116193115711212, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "ref_ce_loss": 0.08707407116889954, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "loss": 0.2422042340040207, + "step": 22820 + }, + { + "ce_loss": 0.004890437703579664, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "distill_loss": 0.17459377646446228, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "ref_ce_loss": 0.062421608716249466, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "loss": 0.8745191097259521, + "step": 22820 + }, + { + "ce_loss": 0.024280430749058723, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "distill_loss": 0.18831610679626465, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "ref_ce_loss": 0.06424184143543243, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "loss": 0.27096831798553467, + "step": 22820 + }, + { + "ce_loss": 0.02626364678144455, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "distill_loss": 0.15062075853347778, + "epoch": 7.61174116077385, + "step": 22820 + }, + { + "epoch": 7.61174116077385, + "ref_ce_loss": 0.03873790428042412, + "step": 22820 + }, + { + "epoch": 7.615076717811874, + "loss": 0.3557, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "grad_norm": 2.347228765487671, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "learning_rate": 1.8177106744443392e-06, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "loss": 0.4428894519805908, + "step": 22830 + }, + { + "ce_loss": 0.011006324551999569, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "distill_loss": 0.10822677612304688, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "ref_ce_loss": 0.04102815315127373, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "loss": 0.4443509578704834, + "step": 22830 + }, + { + "ce_loss": 0.012050008401274681, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "distill_loss": 0.31021803617477417, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "ref_ce_loss": 0.0780838206410408, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "loss": 0.3268548846244812, + "step": 22830 + }, + { + "ce_loss": 0.0733487457036972, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "distill_loss": 0.14904393255710602, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "ref_ce_loss": 0.06276282668113708, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "loss": 0.3149307370185852, + "step": 22830 + }, + { + "ce_loss": 0.012128917500376701, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "distill_loss": 0.17375248670578003, + "epoch": 7.615076717811874, + "step": 22830 + }, + { + "epoch": 7.615076717811874, + "ref_ce_loss": 0.03347927704453468, + "step": 22830 + }, + { + "epoch": 7.6184122748499, + "loss": 0.3563, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "grad_norm": 7.628005504608154, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "learning_rate": 1.7864067926060432e-06, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "loss": 0.45842376351356506, + "step": 22840 + }, + { + "ce_loss": 0.07494988292455673, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "distill_loss": 0.21095065772533417, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "ref_ce_loss": 0.0487661212682724, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "loss": 0.3503536581993103, + "step": 22840 + }, + { + "ce_loss": 0.029765352606773376, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "distill_loss": 0.19903971254825592, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "ref_ce_loss": 0.05890416353940964, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "loss": 0.2923668324947357, + "step": 22840 + }, + { + "ce_loss": 0.0437467023730278, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "distill_loss": 0.1662764549255371, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "ref_ce_loss": 0.08174384385347366, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "loss": 0.21276073157787323, + "step": 22840 + }, + { + "ce_loss": 0.014563250355422497, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "distill_loss": 0.12940552830696106, + "epoch": 7.6184122748499, + "step": 22840 + }, + { + "epoch": 7.6184122748499, + "ref_ce_loss": 0.04358154535293579, + "step": 22840 + }, + { + "epoch": 7.621747831887925, + "loss": 0.32, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "grad_norm": 3.632291316986084, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "learning_rate": 1.7553731938962756e-06, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "loss": 0.37080612778663635, + "step": 22850 + }, + { + "ce_loss": 0.03212735056877136, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "distill_loss": 0.2639005780220032, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "ref_ce_loss": 0.050537895411252975, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "loss": 0.30988186597824097, + "step": 22850 + }, + { + "ce_loss": 0.07211822271347046, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "distill_loss": 0.15322273969650269, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "ref_ce_loss": 0.05520307272672653, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "loss": 0.18619802594184875, + "step": 22850 + }, + { + "ce_loss": 0.0006898845313116908, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "distill_loss": 0.10551872849464417, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "ref_ce_loss": 0.02266588620841503, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "loss": 0.16827353835105896, + "step": 22850 + }, + { + "ce_loss": 0.029503915458917618, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "distill_loss": 0.09243550896644592, + "epoch": 7.621747831887925, + "step": 22850 + }, + { + "epoch": 7.621747831887925, + "ref_ce_loss": 0.03407926857471466, + "step": 22850 + }, + { + "epoch": 7.625083388925951, + "loss": 0.2909, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "grad_norm": 2.7291767597198486, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "learning_rate": 1.7246099349080665e-06, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "loss": 0.213385671377182, + "step": 22860 + }, + { + "ce_loss": 0.028410280123353004, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "distill_loss": 0.12494742125272751, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "ref_ce_loss": 0.04404313489794731, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "loss": 0.28631460666656494, + "step": 22860 + }, + { + "ce_loss": 0.014288785867393017, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "distill_loss": 0.17968155443668365, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "ref_ce_loss": 0.0649716928601265, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "loss": 0.2869929075241089, + "step": 22860 + }, + { + "ce_loss": 0.041470956057310104, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "distill_loss": 0.19409871101379395, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "ref_ce_loss": 0.037282612174749374, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "loss": 0.23375526070594788, + "step": 22860 + }, + { + "ce_loss": 0.048656996339559555, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "distill_loss": 0.115402951836586, + "epoch": 7.625083388925951, + "step": 22860 + }, + { + "epoch": 7.625083388925951, + "ref_ce_loss": 0.047733623534440994, + "step": 22860 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.3182, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "grad_norm": 1.8855068683624268, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "learning_rate": 1.6941170717414577e-06, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.23261196911334991, + "step": 22870 + }, + { + "ce_loss": 0.02123933471739292, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "distill_loss": 0.10758798569440842, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "ref_ce_loss": 0.049759477376937866, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.2860763370990753, + "step": 22870 + }, + { + "ce_loss": 0.06312866508960724, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "distill_loss": 0.11870720982551575, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "ref_ce_loss": 0.041579004377126694, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.5179301500320435, + "step": 22870 + }, + { + "ce_loss": 0.03702806681394577, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "distill_loss": 0.2259015291929245, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "ref_ce_loss": 0.0677097737789154, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "loss": 0.20624791085720062, + "step": 22870 + }, + { + "ce_loss": 0.01307401992380619, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "distill_loss": 0.13857774436473846, + "epoch": 7.6284189459639755, + "step": 22870 + }, + { + "epoch": 7.6284189459639755, + "ref_ce_loss": 0.054351504892110825, + "step": 22870 + }, + { + "epoch": 7.631754503002002, + "loss": 0.3066, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "grad_norm": 3.6401045322418213, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "learning_rate": 1.6638946600034175e-06, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "loss": 0.22979536652565002, + "step": 22880 + }, + { + "ce_loss": 0.02197353169322014, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "distill_loss": 0.1502780318260193, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "ref_ce_loss": 0.05745101720094681, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "loss": 0.4088671803474426, + "step": 22880 + }, + { + "ce_loss": 0.06212860345840454, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "distill_loss": 0.12440603971481323, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "ref_ce_loss": 0.06890290975570679, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "loss": 0.3185475170612335, + "step": 22880 + }, + { + "ce_loss": 0.045057594776153564, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "distill_loss": 0.2054925560951233, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "ref_ce_loss": 0.024748776108026505, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "loss": 0.27203235030174255, + "step": 22880 + }, + { + "ce_loss": 0.015324220061302185, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "distill_loss": 0.14852091670036316, + "epoch": 7.631754503002002, + "step": 22880 + }, + { + "epoch": 7.631754503002002, + "ref_ce_loss": 0.03557129576802254, + "step": 22880 + }, + { + "epoch": 7.635090060040026, + "loss": 0.3026, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "grad_norm": 4.086019515991211, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "learning_rate": 1.6339427548076934e-06, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "loss": 0.20336809754371643, + "step": 22890 + }, + { + "ce_loss": 0.015538395382463932, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "distill_loss": 0.11135879158973694, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "ref_ce_loss": 0.04804733768105507, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "loss": 0.24145036935806274, + "step": 22890 + }, + { + "ce_loss": 0.033301178365945816, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "distill_loss": 0.12386928498744965, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "ref_ce_loss": 0.04221475124359131, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "loss": 0.197758749127388, + "step": 22890 + }, + { + "ce_loss": 0.0030058466363698244, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "distill_loss": 0.11973407119512558, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "ref_ce_loss": 0.05511556565761566, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "loss": 0.24757030606269836, + "step": 22890 + }, + { + "ce_loss": 0.04953973740339279, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "distill_loss": 0.13940845429897308, + "epoch": 7.635090060040026, + "step": 22890 + }, + { + "epoch": 7.635090060040026, + "ref_ce_loss": 0.037517912685871124, + "step": 22890 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.3258, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "grad_norm": 5.1843180656433105, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "learning_rate": 1.6042614107747597e-06, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.2708713114261627, + "step": 22900 + }, + { + "ce_loss": 0.026853064075112343, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "distill_loss": 0.1342555582523346, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "ref_ce_loss": 0.04602363333106041, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.2061038613319397, + "step": 22900 + }, + { + "ce_loss": 0.021708890795707703, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "distill_loss": 0.10948806256055832, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "ref_ce_loss": 0.0385892391204834, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.20218154788017273, + "step": 22900 + }, + { + "ce_loss": 0.03984569385647774, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "distill_loss": 0.1096130758523941, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "ref_ce_loss": 0.03738179802894592, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "loss": 0.4746226966381073, + "step": 22900 + }, + { + "ce_loss": 0.04614961892366409, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "distill_loss": 0.25767990946769714, + "epoch": 7.6384256170780525, + "step": 22900 + }, + { + "epoch": 7.6384256170780525, + "ref_ce_loss": 0.059538114815950394, + "step": 22900 + }, + { + "epoch": 7.641761174116077, + "loss": 0.3217, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "grad_norm": 3.924365997314453, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "learning_rate": 1.5748506820316697e-06, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "loss": 0.5141680836677551, + "step": 22910 + }, + { + "ce_loss": 0.08010391145944595, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "distill_loss": 0.21418240666389465, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "ref_ce_loss": 0.05140373855829239, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "loss": 0.2225308120250702, + "step": 22910 + }, + { + "ce_loss": 0.02417745813727379, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "distill_loss": 0.11761415004730225, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "ref_ce_loss": 0.05385226756334305, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "loss": 0.2561490833759308, + "step": 22910 + }, + { + "ce_loss": 0.037857986986637115, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "distill_loss": 0.1793629229068756, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "ref_ce_loss": 0.03873812034726143, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "loss": 0.2256598174571991, + "step": 22910 + }, + { + "ce_loss": 0.03303292766213417, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "distill_loss": 0.11392819881439209, + "epoch": 7.641761174116077, + "step": 22910 + }, + { + "epoch": 7.641761174116077, + "ref_ce_loss": 0.05216430127620697, + "step": 22910 + }, + { + "epoch": 7.645096731154103, + "loss": 0.3363, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "grad_norm": 5.04368257522583, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "learning_rate": 1.5457106222120042e-06, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "loss": 0.45411041378974915, + "step": 22920 + }, + { + "ce_loss": 0.033568039536476135, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "distill_loss": 0.2195194661617279, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "ref_ce_loss": 0.038000307977199554, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "loss": 0.30707457661628723, + "step": 22920 + }, + { + "ce_loss": 0.027316883206367493, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "distill_loss": 0.2306128442287445, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "ref_ce_loss": 0.04899785295128822, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "loss": 0.26368802785873413, + "step": 22920 + }, + { + "ce_loss": 0.04824545979499817, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "distill_loss": 0.1530880630016327, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "ref_ce_loss": 0.050180789083242416, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "loss": 0.25279903411865234, + "step": 22920 + }, + { + "ce_loss": 0.030409902334213257, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "distill_loss": 0.13697512447834015, + "epoch": 7.645096731154103, + "step": 22920 + }, + { + "epoch": 7.645096731154103, + "ref_ce_loss": 0.05594718083739281, + "step": 22920 + }, + { + "epoch": 7.648432288192128, + "loss": 0.3521, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "grad_norm": 2.7559688091278076, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "learning_rate": 1.5168412844557055e-06, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "loss": 0.399747759103775, + "step": 22930 + }, + { + "ce_loss": 0.016245001927018166, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "distill_loss": 0.12318704277276993, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "ref_ce_loss": 0.08122338354587555, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "loss": 0.22639451920986176, + "step": 22930 + }, + { + "ce_loss": 0.0014325794763863087, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "distill_loss": 0.12101196497678757, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "ref_ce_loss": 0.059808388352394104, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "loss": 0.34030061960220337, + "step": 22930 + }, + { + "ce_loss": 0.04827611520886421, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "distill_loss": 0.09970467537641525, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "ref_ce_loss": 0.030907966196537018, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "loss": 0.1849261373281479, + "step": 22930 + }, + { + "ce_loss": 0.018460562452673912, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "distill_loss": 0.10423801094293594, + "epoch": 7.648432288192128, + "step": 22930 + }, + { + "epoch": 7.648432288192128, + "ref_ce_loss": 0.03969254717230797, + "step": 22930 + }, + { + "epoch": 7.651767845230154, + "loss": 0.2977, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "grad_norm": 3.0878891944885254, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "learning_rate": 1.4882427214090776e-06, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "loss": 0.4689764380455017, + "step": 22940 + }, + { + "ce_loss": 0.006221160292625427, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "distill_loss": 0.3133019208908081, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "ref_ce_loss": 0.06789840757846832, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "loss": 0.18552368879318237, + "step": 22940 + }, + { + "ce_loss": 0.015950776636600494, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "distill_loss": 0.11873391270637512, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "ref_ce_loss": 0.05071733891963959, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "loss": 0.31022709608078003, + "step": 22940 + }, + { + "ce_loss": 0.04275962710380554, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "distill_loss": 0.1239824965596199, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "ref_ce_loss": 0.029431330040097237, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "loss": 0.151072695851326, + "step": 22940 + }, + { + "ce_loss": 0.027431553229689598, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "distill_loss": 0.08233068883419037, + "epoch": 7.651767845230154, + "step": 22940 + }, + { + "epoch": 7.651767845230154, + "ref_ce_loss": 0.030642185360193253, + "step": 22940 + }, + { + "epoch": 7.655103402268178, + "loss": 0.2989, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "grad_norm": 4.026222229003906, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "learning_rate": 1.4599149852246361e-06, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "loss": 0.5482908487319946, + "step": 22950 + }, + { + "ce_loss": 0.012102135457098484, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "distill_loss": 0.13015960156917572, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "ref_ce_loss": 0.050248291343450546, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "loss": 0.3930932581424713, + "step": 22950 + }, + { + "ce_loss": 0.032274968922138214, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "distill_loss": 0.19311977922916412, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "ref_ce_loss": 0.06739521026611328, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "loss": 0.5229855179786682, + "step": 22950 + }, + { + "ce_loss": 0.14910782873630524, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "distill_loss": 0.2856809198856354, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "ref_ce_loss": 0.08798830956220627, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "loss": 0.4588366448879242, + "step": 22950 + }, + { + "ce_loss": 0.04331749305129051, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "distill_loss": 0.3251328468322754, + "epoch": 7.655103402268178, + "step": 22950 + }, + { + "epoch": 7.655103402268178, + "ref_ce_loss": 0.050014346837997437, + "step": 22950 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.3257, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "grad_norm": 2.500479221343994, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "learning_rate": 1.4318581275609754e-06, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.27828431129455566, + "step": 22960 + }, + { + "ce_loss": 0.019125934690237045, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "distill_loss": 0.16130469739437103, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "ref_ce_loss": 0.0303462203592062, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.27863505482673645, + "step": 22960 + }, + { + "ce_loss": 0.02259085513651371, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "distill_loss": 0.17387576401233673, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "ref_ce_loss": 0.05217726156115532, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.21874089539051056, + "step": 22960 + }, + { + "ce_loss": 0.014435866847634315, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "distill_loss": 0.13992641866207123, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "ref_ce_loss": 0.03417731821537018, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "loss": 0.30610159039497375, + "step": 22960 + }, + { + "ce_loss": 0.029911501333117485, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "distill_loss": 0.13687187433242798, + "epoch": 7.6584389593062046, + "step": 22960 + }, + { + "epoch": 7.6584389593062046, + "ref_ce_loss": 0.07079162448644638, + "step": 22960 + }, + { + "epoch": 7.661774516344229, + "loss": 0.3206, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "grad_norm": 3.2441189289093018, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "learning_rate": 1.4040721995827342e-06, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "loss": 0.30744755268096924, + "step": 22970 + }, + { + "ce_loss": 0.01652863807976246, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "distill_loss": 0.09780027717351913, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "ref_ce_loss": 0.04824933409690857, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "loss": 0.2842020094394684, + "step": 22970 + }, + { + "ce_loss": 0.026345144957304, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "distill_loss": 0.20223809778690338, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "ref_ce_loss": 0.05555181950330734, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "loss": 0.49395424127578735, + "step": 22970 + }, + { + "ce_loss": 0.0228904839605093, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "distill_loss": 0.2043241262435913, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "ref_ce_loss": 0.04845643788576126, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "loss": 0.2669130265712738, + "step": 22970 + }, + { + "ce_loss": 0.05929362401366234, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "distill_loss": 0.1629556119441986, + "epoch": 7.661774516344229, + "step": 22970 + }, + { + "epoch": 7.661774516344229, + "ref_ce_loss": 0.04456076771020889, + "step": 22970 + }, + { + "epoch": 7.665110073382255, + "loss": 0.3014, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "grad_norm": 3.482917070388794, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "learning_rate": 1.3765572519604806e-06, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "loss": 0.3842131793498993, + "step": 22980 + }, + { + "ce_loss": 0.02830090932548046, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "distill_loss": 0.20186612010002136, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "ref_ce_loss": 0.037983302026987076, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "loss": 0.34967049956321716, + "step": 22980 + }, + { + "ce_loss": 0.042643673717975616, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "distill_loss": 0.1744934767484665, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "ref_ce_loss": 0.05821855738759041, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "loss": 0.19610543549060822, + "step": 22980 + }, + { + "ce_loss": 0.029282161965966225, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "distill_loss": 0.12504170835018158, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "ref_ce_loss": 0.04139018431305885, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "loss": 0.345589816570282, + "step": 22980 + }, + { + "ce_loss": 0.03254815936088562, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "distill_loss": 0.2645869851112366, + "epoch": 7.665110073382255, + "step": 22980 + }, + { + "epoch": 7.665110073382255, + "ref_ce_loss": 0.04817293584346771, + "step": 22980 + }, + { + "epoch": 7.66844563042028, + "loss": 0.3194, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "grad_norm": 2.9004547595977783, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "learning_rate": 1.3493133348706442e-06, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "loss": 0.2697019875049591, + "step": 22990 + }, + { + "ce_loss": 0.05502264201641083, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "distill_loss": 0.15867778658866882, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "ref_ce_loss": 0.05582212656736374, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "loss": 0.3151005804538727, + "step": 22990 + }, + { + "ce_loss": 0.06821805983781815, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "distill_loss": 0.16759486496448517, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "ref_ce_loss": 0.059118907898664474, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "loss": 0.6699935793876648, + "step": 22990 + }, + { + "ce_loss": 0.020239880308508873, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "distill_loss": 0.1337202489376068, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "ref_ce_loss": 0.04875611513853073, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "loss": 0.43389075994491577, + "step": 22990 + }, + { + "ce_loss": 0.013298127800226212, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "distill_loss": 0.16998136043548584, + "epoch": 7.66844563042028, + "step": 22990 + }, + { + "epoch": 7.66844563042028, + "ref_ce_loss": 0.04834935814142227, + "step": 22990 + }, + { + "epoch": 7.671781187458306, + "loss": 0.3221, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "grad_norm": 5.315511226654053, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "learning_rate": 1.3223404979953834e-06, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "loss": 0.30739280581474304, + "step": 23000 + }, + { + "ce_loss": 0.01956716738641262, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "distill_loss": 0.21019117534160614, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "ref_ce_loss": 0.05563579127192497, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "loss": 0.2983548939228058, + "step": 23000 + }, + { + "ce_loss": 0.025492293760180473, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "distill_loss": 0.20712508261203766, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "ref_ce_loss": 0.044836029410362244, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "loss": 0.16253502666950226, + "step": 23000 + }, + { + "ce_loss": 0.015675950795412064, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "distill_loss": 0.0947803407907486, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "ref_ce_loss": 0.0519745759665966, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "loss": 0.792258620262146, + "step": 23000 + }, + { + "ce_loss": 0.02057049050927162, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "distill_loss": 0.26838886737823486, + "epoch": 7.671781187458306, + "step": 23000 + }, + { + "epoch": 7.671781187458306, + "ref_ce_loss": 0.04232442378997803, + "step": 23000 + }, + { + "epoch": 7.67511674449633, + "loss": 0.3352, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "grad_norm": 6.571993827819824, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "learning_rate": 1.2956387905225018e-06, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "loss": 0.2995764911174774, + "step": 23010 + }, + { + "ce_loss": 0.04184538498520851, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "distill_loss": 0.13831311464309692, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "ref_ce_loss": 0.05882956087589264, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "loss": 0.5072600245475769, + "step": 23010 + }, + { + "ce_loss": 0.03169850632548332, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "distill_loss": 0.13568931818008423, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "ref_ce_loss": 0.032809965312480927, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "loss": 0.4805125296115875, + "step": 23010 + }, + { + "ce_loss": 0.05861205235123634, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "distill_loss": 0.17325712740421295, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "ref_ce_loss": 0.09007645398378372, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "loss": 0.21181714534759521, + "step": 23010 + }, + { + "ce_loss": 0.02114477939903736, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "distill_loss": 0.1146874874830246, + "epoch": 7.67511674449633, + "step": 23010 + }, + { + "epoch": 7.67511674449633, + "ref_ce_loss": 0.04349588602781296, + "step": 23010 + }, + { + "epoch": 7.678452301534357, + "loss": 0.3521, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "grad_norm": 3.2868006229400635, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "learning_rate": 1.2692082611453825e-06, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "loss": 0.45636117458343506, + "step": 23020 + }, + { + "ce_loss": 0.047584645450115204, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "distill_loss": 0.3129931092262268, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "ref_ce_loss": 0.05031001567840576, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "loss": 0.43598026037216187, + "step": 23020 + }, + { + "ce_loss": 0.019500087946653366, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "distill_loss": 0.2599097192287445, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "ref_ce_loss": 0.0634281113743782, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "loss": 0.32026126980781555, + "step": 23020 + }, + { + "ce_loss": 0.025082221254706383, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "distill_loss": 0.18352770805358887, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "ref_ce_loss": 0.06303086876869202, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "loss": 0.3685997426509857, + "step": 23020 + }, + { + "ce_loss": 0.03256651759147644, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "distill_loss": 0.19047406315803528, + "epoch": 7.678452301534357, + "step": 23020 + }, + { + "epoch": 7.678452301534357, + "ref_ce_loss": 0.04560491442680359, + "step": 23020 + }, + { + "epoch": 7.681787858572381, + "loss": 0.3419, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "grad_norm": 3.9646894931793213, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "learning_rate": 1.2430489580628699e-06, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "loss": 0.3957512378692627, + "step": 23030 + }, + { + "ce_loss": 0.07101224362850189, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "distill_loss": 0.1624712198972702, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "ref_ce_loss": 0.07757756114006042, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "loss": 0.24718983471393585, + "step": 23030 + }, + { + "ce_loss": 0.02744249440729618, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "distill_loss": 0.13793756067752838, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "ref_ce_loss": 0.04307285323739052, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "loss": 0.5288066864013672, + "step": 23030 + }, + { + "ce_loss": 0.002461702097207308, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "distill_loss": 0.14066945016384125, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "ref_ce_loss": 0.037495292723178864, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "loss": 0.6094015836715698, + "step": 23030 + }, + { + "ce_loss": 0.047975365072488785, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "distill_loss": 0.20297566056251526, + "epoch": 7.681787858572381, + "step": 23030 + }, + { + "epoch": 7.681787858572381, + "ref_ce_loss": 0.07441578805446625, + "step": 23030 + }, + { + "epoch": 7.685123415610407, + "loss": 0.3273, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "grad_norm": 2.976410150527954, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "learning_rate": 1.2171609289792384e-06, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "loss": 0.3225436806678772, + "step": 23040 + }, + { + "ce_loss": 0.009163063950836658, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "distill_loss": 0.16070663928985596, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "ref_ce_loss": 0.053454380482435226, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "loss": 0.5536080598831177, + "step": 23040 + }, + { + "ce_loss": 0.05467384308576584, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "distill_loss": 0.2630139887332916, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "ref_ce_loss": 0.07442136108875275, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "loss": 0.24188895523548126, + "step": 23040 + }, + { + "ce_loss": 0.0048294877633452415, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "distill_loss": 0.11708530783653259, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "ref_ce_loss": 0.05692865327000618, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "loss": 0.30179455876350403, + "step": 23040 + }, + { + "ce_loss": 0.05799272656440735, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "distill_loss": 0.1916697472333908, + "epoch": 7.685123415610407, + "step": 23040 + }, + { + "epoch": 7.685123415610407, + "ref_ce_loss": 0.051968324929475784, + "step": 23040 + }, + { + "epoch": 7.688458972648432, + "loss": 0.3115, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "grad_norm": 4.0610222816467285, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "learning_rate": 1.1915442211040404e-06, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "loss": 0.3361358642578125, + "step": 23050 + }, + { + "ce_loss": 0.0818992331624031, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "distill_loss": 0.20039232075214386, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "ref_ce_loss": 0.05358118191361427, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "loss": 0.28487151861190796, + "step": 23050 + }, + { + "ce_loss": 0.009340011514723301, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "distill_loss": 0.15453654527664185, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "ref_ce_loss": 0.051413096487522125, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "loss": 0.34228986501693726, + "step": 23050 + }, + { + "ce_loss": 0.035694170743227005, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "distill_loss": 0.15729156136512756, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "ref_ce_loss": 0.04107082262635231, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "loss": 0.3071814775466919, + "step": 23050 + }, + { + "ce_loss": 0.035663481801748276, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "distill_loss": 0.2025829255580902, + "epoch": 7.688458972648432, + "step": 23050 + }, + { + "epoch": 7.688458972648432, + "ref_ce_loss": 0.03947122022509575, + "step": 23050 + }, + { + "epoch": 7.691794529686458, + "loss": 0.3508, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "grad_norm": 5.8116679191589355, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "learning_rate": 1.166198881152025e-06, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "loss": 0.2547825276851654, + "step": 23060 + }, + { + "ce_loss": 0.010203652083873749, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "distill_loss": 0.14617383480072021, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "ref_ce_loss": 0.056542348116636276, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "loss": 0.34109601378440857, + "step": 23060 + }, + { + "ce_loss": 0.014679406769573689, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "distill_loss": 0.24016183614730835, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "ref_ce_loss": 0.04968947917222977, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "loss": 0.20898187160491943, + "step": 23060 + }, + { + "ce_loss": 0.03771166130900383, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "distill_loss": 0.1140361949801445, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "ref_ce_loss": 0.03934255987405777, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "loss": 0.2710397243499756, + "step": 23060 + }, + { + "ce_loss": 0.005295777693390846, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "distill_loss": 0.1349426507949829, + "epoch": 7.691794529686458, + "step": 23060 + }, + { + "epoch": 7.691794529686458, + "ref_ce_loss": 0.04271606355905533, + "step": 23060 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.3044, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "grad_norm": 3.171736478805542, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "learning_rate": 1.14112495534312e-06, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.2496335506439209, + "step": 23070 + }, + { + "ce_loss": 0.04608842357993126, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "distill_loss": 0.13245657086372375, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "ref_ce_loss": 0.049435753375291824, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.33521968126296997, + "step": 23070 + }, + { + "ce_loss": 0.03709598258137703, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "distill_loss": 0.22455614805221558, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "ref_ce_loss": 0.032426513731479645, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.5236812829971313, + "step": 23070 + }, + { + "ce_loss": 0.04154251515865326, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "distill_loss": 0.20564574003219604, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "ref_ce_loss": 0.0866141989827156, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "loss": 0.2064809799194336, + "step": 23070 + }, + { + "ce_loss": 0.011886204592883587, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "distill_loss": 0.11368348449468613, + "epoch": 7.6951300867244825, + "step": 23070 + }, + { + "epoch": 7.6951300867244825, + "ref_ce_loss": 0.05952262133359909, + "step": 23070 + }, + { + "epoch": 7.698465643762509, + "loss": 0.3111, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "grad_norm": 4.591440200805664, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "learning_rate": 1.116322489402266e-06, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "loss": 0.22700554132461548, + "step": 23080 + }, + { + "ce_loss": 0.029819414019584656, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "distill_loss": 0.121553435921669, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "ref_ce_loss": 0.056140560656785965, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "loss": 0.36817777156829834, + "step": 23080 + }, + { + "ce_loss": 0.03735628351569176, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "distill_loss": 0.19975467026233673, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "ref_ce_loss": 0.06319694221019745, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "loss": 0.21822097897529602, + "step": 23080 + }, + { + "ce_loss": 0.02268623188138008, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "distill_loss": 0.13283170759677887, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "ref_ce_loss": 0.06267096102237701, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "loss": 0.26581940054893494, + "step": 23080 + }, + { + "ce_loss": 0.029165010899305344, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "distill_loss": 0.155491441488266, + "epoch": 7.698465643762509, + "step": 23080 + }, + { + "epoch": 7.698465643762509, + "ref_ce_loss": 0.049425188452005386, + "step": 23080 + }, + { + "epoch": 7.701801200800533, + "loss": 0.3056, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "grad_norm": 2.542473554611206, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "learning_rate": 1.091791528559366e-06, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "loss": 0.19253060221672058, + "step": 23090 + }, + { + "ce_loss": 0.0050561269745230675, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "distill_loss": 0.14840315282344818, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "ref_ce_loss": 0.027028419077396393, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "loss": 0.3827259838581085, + "step": 23090 + }, + { + "ce_loss": 0.02695002406835556, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "distill_loss": 0.16917379200458527, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "ref_ce_loss": 0.06104246899485588, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "loss": 0.9057022333145142, + "step": 23090 + }, + { + "ce_loss": 0.030367007479071617, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "distill_loss": 0.1319291591644287, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "ref_ce_loss": 0.03348112851381302, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "loss": 0.5137628316879272, + "step": 23090 + }, + { + "ce_loss": 0.060257963836193085, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "distill_loss": 0.31021299958229065, + "epoch": 7.701801200800533, + "step": 23090 + }, + { + "epoch": 7.701801200800533, + "ref_ce_loss": 0.07672536373138428, + "step": 23090 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.3217, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "grad_norm": 2.8845362663269043, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "learning_rate": 1.0675321175492025e-06, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.20346535742282867, + "step": 23100 + }, + { + "ce_loss": 0.007688038982450962, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "distill_loss": 0.15419723093509674, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "ref_ce_loss": 0.04151067137718201, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.16337968409061432, + "step": 23100 + }, + { + "ce_loss": 0.002642042702063918, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "distill_loss": 0.11293807625770569, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "ref_ce_loss": 0.0326145775616169, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.4473174512386322, + "step": 23100 + }, + { + "ce_loss": 0.038927558809518814, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "distill_loss": 0.12315823137760162, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "ref_ce_loss": 0.04285747930407524, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "loss": 0.3779004216194153, + "step": 23100 + }, + { + "ce_loss": 0.054333869367837906, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "distill_loss": 0.1537066102027893, + "epoch": 7.7051367578385594, + "step": 23100 + }, + { + "epoch": 7.7051367578385594, + "ref_ce_loss": 0.04151177778840065, + "step": 23100 + }, + { + "epoch": 7.708472314876584, + "loss": 0.3035, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "grad_norm": 3.3525681495666504, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "learning_rate": 1.0435443006114208e-06, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "loss": 0.44415926933288574, + "step": 23110 + }, + { + "ce_loss": 0.02013283036649227, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "distill_loss": 0.21743977069854736, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "ref_ce_loss": 0.07050774246454239, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "loss": 0.32459118962287903, + "step": 23110 + }, + { + "ce_loss": 0.031134208664298058, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "distill_loss": 0.13974878191947937, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "ref_ce_loss": 0.05988594517111778, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "loss": 0.3135908246040344, + "step": 23110 + }, + { + "ce_loss": 0.028349503874778748, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "distill_loss": 0.20856799185276031, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "ref_ce_loss": 0.05278262495994568, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "loss": 0.18645304441452026, + "step": 23110 + }, + { + "ce_loss": 0.012953277677297592, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "distill_loss": 0.1352105736732483, + "epoch": 7.708472314876584, + "step": 23110 + }, + { + "epoch": 7.708472314876584, + "ref_ce_loss": 0.03790950030088425, + "step": 23110 + }, + { + "epoch": 7.71180787191461, + "loss": 0.3338, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "grad_norm": 3.6223158836364746, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "learning_rate": 1.019828121490296e-06, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "loss": 0.22437624633312225, + "step": 23120 + }, + { + "ce_loss": 0.05144285783171654, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "distill_loss": 0.13139377534389496, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "ref_ce_loss": 0.04147119075059891, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "loss": 0.4318474531173706, + "step": 23120 + }, + { + "ce_loss": 0.09833890199661255, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "distill_loss": 0.28609535098075867, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "ref_ce_loss": 0.04700387641787529, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "loss": 0.159266397356987, + "step": 23120 + }, + { + "ce_loss": 0.006263875402510166, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "distill_loss": 0.09080880135297775, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "ref_ce_loss": 0.04199573025107384, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "loss": 0.2531209886074066, + "step": 23120 + }, + { + "ce_loss": 0.0459630973637104, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "distill_loss": 0.09766028076410294, + "epoch": 7.71180787191461, + "step": 23120 + }, + { + "epoch": 7.71180787191461, + "ref_ce_loss": 0.04684332385659218, + "step": 23120 + }, + { + "epoch": 7.715143428952635, + "loss": 0.3347, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "grad_norm": 3.626303195953369, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "learning_rate": 9.963836234347988e-07, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "loss": 0.31000179052352905, + "step": 23130 + }, + { + "ce_loss": 0.009761415421962738, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "distill_loss": 0.11593847721815109, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "ref_ce_loss": 0.060613133013248444, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "loss": 0.23310451209545135, + "step": 23130 + }, + { + "ce_loss": 0.018389590084552765, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "distill_loss": 0.17988187074661255, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "ref_ce_loss": 0.02529945969581604, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "loss": 0.18636879324913025, + "step": 23130 + }, + { + "ce_loss": 0.019744787365198135, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "distill_loss": 0.10091696679592133, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "ref_ce_loss": 0.04688052833080292, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "loss": 0.44403672218322754, + "step": 23130 + }, + { + "ce_loss": 0.018276330083608627, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "distill_loss": 0.12965573370456696, + "epoch": 7.715143428952635, + "step": 23130 + }, + { + "epoch": 7.715143428952635, + "ref_ce_loss": 0.02683100290596485, + "step": 23130 + }, + { + "epoch": 7.718478985990661, + "loss": 0.3175, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "grad_norm": 3.112769842147827, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "learning_rate": 9.73210849198447e-07, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "loss": 0.1915590912103653, + "step": 23140 + }, + { + "ce_loss": 0.02874593995511532, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "distill_loss": 0.11782795190811157, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "ref_ce_loss": 0.04489807412028313, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "loss": 0.39852508902549744, + "step": 23140 + }, + { + "ce_loss": 0.045446548610925674, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "distill_loss": 0.2548648416996002, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "ref_ce_loss": 0.07269423454999924, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "loss": 0.30995240807533264, + "step": 23140 + }, + { + "ce_loss": 0.06380195170640945, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "distill_loss": 0.13276419043540955, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "ref_ce_loss": 0.06999576836824417, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "loss": 0.14055275917053223, + "step": 23140 + }, + { + "ce_loss": 0.022481508553028107, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "distill_loss": 0.09223896265029907, + "epoch": 7.718478985990661, + "step": 23140 + }, + { + "epoch": 7.718478985990661, + "ref_ce_loss": 0.02576570026576519, + "step": 23140 + }, + { + "epoch": 7.721814543028685, + "loss": 0.3163, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "grad_norm": 4.643634796142578, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "learning_rate": 9.503098410392207e-07, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "loss": 0.31578657031059265, + "step": 23150 + }, + { + "ce_loss": 0.014320285059511662, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "distill_loss": 0.18919619917869568, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "ref_ce_loss": 0.0538095161318779, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "loss": 0.3112615942955017, + "step": 23150 + }, + { + "ce_loss": 0.04894857853651047, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "distill_loss": 0.1551634967327118, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "ref_ce_loss": 0.028474921360611916, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "loss": 0.31689026951789856, + "step": 23150 + }, + { + "ce_loss": 0.011591101996600628, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "distill_loss": 0.1778896301984787, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "ref_ce_loss": 0.0530037060379982, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "loss": 0.21250712871551514, + "step": 23150 + }, + { + "ce_loss": 0.01693120412528515, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "distill_loss": 0.1388123333454132, + "epoch": 7.721814543028685, + "step": 23150 + }, + { + "epoch": 7.721814543028685, + "ref_ce_loss": 0.05666331201791763, + "step": 23150 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.3276, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "grad_norm": 2.5653464794158936, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "learning_rate": 9.27680640719547e-07, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.20201954245567322, + "step": 23160 + }, + { + "ce_loss": 0.01078097615391016, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "distill_loss": 0.09477313607931137, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "ref_ce_loss": 0.03961151838302612, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.3385741114616394, + "step": 23160 + }, + { + "ce_loss": 0.020411452278494835, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "distill_loss": 0.13669410347938538, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "ref_ce_loss": 0.06803715229034424, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.4764685034751892, + "step": 23160 + }, + { + "ce_loss": 0.028775783255696297, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "distill_loss": 0.214456707239151, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "ref_ce_loss": 0.07601587474346161, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "loss": 0.4041188955307007, + "step": 23160 + }, + { + "ce_loss": 0.09439730644226074, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "distill_loss": 0.1673288494348526, + "epoch": 7.7251501000667115, + "step": 23160 + }, + { + "epoch": 7.7251501000667115, + "ref_ce_loss": 0.06416967511177063, + "step": 23160 + }, + { + "epoch": 7.728485657104736, + "loss": 0.3091, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "grad_norm": 2.889651298522949, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "learning_rate": 9.053232895061657e-07, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "loss": 0.5144741535186768, + "step": 23170 + }, + { + "ce_loss": 0.020566776394844055, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "distill_loss": 0.3035084009170532, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "ref_ce_loss": 0.062379274517297745, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "loss": 0.23927778005599976, + "step": 23170 + }, + { + "ce_loss": 0.012099779210984707, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "distill_loss": 0.13171517848968506, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "ref_ce_loss": 0.06338847428560257, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "loss": 0.20670422911643982, + "step": 23170 + }, + { + "ce_loss": 0.01207758579403162, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "distill_loss": 0.12016580253839493, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "ref_ce_loss": 0.031029202044010162, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "loss": 0.4285022020339966, + "step": 23170 + }, + { + "ce_loss": 0.06939056515693665, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "distill_loss": 0.2485809475183487, + "epoch": 7.728485657104736, + "step": 23170 + }, + { + "epoch": 7.728485657104736, + "ref_ce_loss": 0.048880282789468765, + "step": 23170 + }, + { + "epoch": 7.731821214142762, + "loss": 0.3049, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "grad_norm": 3.228677749633789, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "learning_rate": 8.832378281700303e-07, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "loss": 0.17703618109226227, + "step": 23180 + }, + { + "ce_loss": 0.013402396813035011, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "distill_loss": 0.10235219448804855, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "ref_ce_loss": 0.06116965785622597, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "loss": 0.31809985637664795, + "step": 23180 + }, + { + "ce_loss": 0.023722652345895767, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "distill_loss": 0.11942629516124725, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "ref_ce_loss": 0.049432169646024704, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "loss": 0.23215220868587494, + "step": 23180 + }, + { + "ce_loss": 0.05213412642478943, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "distill_loss": 0.1340891718864441, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "ref_ce_loss": 0.03527505323290825, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "loss": 0.2313213050365448, + "step": 23180 + }, + { + "ce_loss": 0.03938008099794388, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "distill_loss": 0.15864311158657074, + "epoch": 7.731821214142762, + "step": 23180 + }, + { + "epoch": 7.731821214142762, + "ref_ce_loss": 0.0247786957770586, + "step": 23180 + }, + { + "epoch": 7.735156771180787, + "loss": 0.3335, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "grad_norm": 3.1714279651641846, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "learning_rate": 8.614242969863572e-07, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "loss": 0.23594240844249725, + "step": 23190 + }, + { + "ce_loss": 0.02036680467426777, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "distill_loss": 0.15960662066936493, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "ref_ce_loss": 0.05590438097715378, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "loss": 0.3538329601287842, + "step": 23190 + }, + { + "ce_loss": 0.08600302040576935, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "distill_loss": 0.19357657432556152, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "ref_ce_loss": 0.06092427670955658, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "loss": 0.4086431860923767, + "step": 23190 + }, + { + "ce_loss": 0.036485668271780014, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "distill_loss": 0.13172754645347595, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "ref_ce_loss": 0.05472996085882187, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "loss": 0.29737913608551025, + "step": 23190 + }, + { + "ce_loss": 0.025702279061079025, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "distill_loss": 0.1611969769001007, + "epoch": 7.735156771180787, + "step": 23190 + }, + { + "epoch": 7.735156771180787, + "ref_ce_loss": 0.0548030324280262, + "step": 23190 + }, + { + "epoch": 7.738492328218813, + "loss": 0.3057, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "grad_norm": 3.2966296672821045, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "learning_rate": 8.398827357343929e-07, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "loss": 0.3330060839653015, + "step": 23200 + }, + { + "ce_loss": 0.042179033160209656, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "distill_loss": 0.13909977674484253, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "ref_ce_loss": 0.07637790590524673, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "loss": 0.3866680860519409, + "step": 23200 + }, + { + "ce_loss": 0.0406869538128376, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "distill_loss": 0.14271864295005798, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "ref_ce_loss": 0.045196447521448135, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "loss": 0.40931081771850586, + "step": 23200 + }, + { + "ce_loss": 0.0653044730424881, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "distill_loss": 0.10990387946367264, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "ref_ce_loss": 0.06856023520231247, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "loss": 0.25872522592544556, + "step": 23200 + }, + { + "ce_loss": 0.025372039526700974, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "distill_loss": 0.1737333983182907, + "epoch": 7.738492328218813, + "step": 23200 + }, + { + "epoch": 7.738492328218813, + "ref_ce_loss": 0.0390053316950798, + "step": 23200 + }, + { + "epoch": 7.741827885256837, + "loss": 0.3241, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "grad_norm": 2.839111566543579, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "learning_rate": 8.186131836974474e-07, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "loss": 0.2935517132282257, + "step": 23210 + }, + { + "ce_loss": 0.06324876844882965, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "distill_loss": 0.14696261286735535, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "ref_ce_loss": 0.0566866435110569, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "loss": 0.5052548050880432, + "step": 23210 + }, + { + "ce_loss": 0.012156311422586441, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "distill_loss": 0.23311343789100647, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "ref_ce_loss": 0.06416646391153336, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "loss": 0.18498748540878296, + "step": 23210 + }, + { + "ce_loss": 0.0010739491553977132, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "distill_loss": 0.09309439361095428, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "ref_ce_loss": 0.029653212055563927, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "loss": 0.24336495995521545, + "step": 23210 + }, + { + "ce_loss": 0.05524908006191254, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "distill_loss": 0.13438133895397186, + "epoch": 7.741827885256837, + "step": 23210 + }, + { + "epoch": 7.741827885256837, + "ref_ce_loss": 0.039194490760564804, + "step": 23210 + }, + { + "epoch": 7.745163442294864, + "loss": 0.295, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "grad_norm": 2.449200391769409, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "learning_rate": 7.976156796627942e-07, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "loss": 0.18167901039123535, + "step": 23220 + }, + { + "ce_loss": 0.031659264117479324, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "distill_loss": 0.12303122133016586, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "ref_ce_loss": 0.02684379555284977, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "loss": 0.19133666157722473, + "step": 23220 + }, + { + "ce_loss": 0.040329594165086746, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "distill_loss": 0.1013164073228836, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "ref_ce_loss": 0.04963481053709984, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "loss": 0.37134993076324463, + "step": 23220 + }, + { + "ce_loss": 0.01046949066221714, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "distill_loss": 0.2826530933380127, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "ref_ce_loss": 0.05680123716592789, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "loss": 0.35854285955429077, + "step": 23220 + }, + { + "ce_loss": 0.050170235335826874, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "distill_loss": 0.12987568974494934, + "epoch": 7.745163442294864, + "step": 23220 + }, + { + "epoch": 7.745163442294864, + "ref_ce_loss": 0.02719688042998314, + "step": 23220 + }, + { + "epoch": 7.748498999332888, + "loss": 0.3107, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "grad_norm": 6.194815635681152, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "learning_rate": 7.7689026192162e-07, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "loss": 0.28531792759895325, + "step": 23230 + }, + { + "ce_loss": 0.018100492656230927, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "distill_loss": 0.18027476966381073, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "ref_ce_loss": 0.04737915098667145, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "loss": 0.4881407618522644, + "step": 23230 + }, + { + "ce_loss": 0.0619981475174427, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "distill_loss": 0.1511438935995102, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "ref_ce_loss": 0.04644491523504257, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "loss": 0.33561399579048157, + "step": 23230 + }, + { + "ce_loss": 0.011638162657618523, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "distill_loss": 0.24499163031578064, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "ref_ce_loss": 0.07888083159923553, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "loss": 0.4013439416885376, + "step": 23230 + }, + { + "ce_loss": 0.07862678915262222, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "distill_loss": 0.2322043627500534, + "epoch": 7.748498999332888, + "step": 23230 + }, + { + "epoch": 7.748498999332888, + "ref_ce_loss": 0.06721118837594986, + "step": 23230 + }, + { + "epoch": 7.751834556370914, + "loss": 0.347, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "grad_norm": 2.814399003982544, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "learning_rate": 7.564369682688754e-07, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "loss": 0.19102168083190918, + "step": 23240 + }, + { + "ce_loss": 0.0067435563541948795, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "distill_loss": 0.12526308000087738, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "ref_ce_loss": 0.058875057846307755, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "loss": 0.2656038701534271, + "step": 23240 + }, + { + "ce_loss": 0.014099403284490108, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "distill_loss": 0.14388953149318695, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "ref_ce_loss": 0.04784628748893738, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "loss": 0.718388557434082, + "step": 23240 + }, + { + "ce_loss": 0.01962624490261078, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "distill_loss": 0.26039832830429077, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "ref_ce_loss": 0.05587448552250862, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "loss": 0.28665295243263245, + "step": 23240 + }, + { + "ce_loss": 0.031991392374038696, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "distill_loss": 0.1633574366569519, + "epoch": 7.751834556370914, + "step": 23240 + }, + { + "epoch": 7.751834556370914, + "ref_ce_loss": 0.042982008308172226, + "step": 23240 + }, + { + "epoch": 7.755170113408939, + "loss": 0.3545, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "grad_norm": 3.5545544624328613, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "learning_rate": 7.362558360033411e-07, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "loss": 0.22462019324302673, + "step": 23250 + }, + { + "ce_loss": 0.030002467334270477, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "distill_loss": 0.1342082917690277, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "ref_ce_loss": 0.037484072148799896, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "loss": 0.2946101427078247, + "step": 23250 + }, + { + "ce_loss": 0.016634244471788406, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "distill_loss": 0.21034982800483704, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "ref_ce_loss": 0.06742505729198456, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "loss": 0.2736393213272095, + "step": 23250 + }, + { + "ce_loss": 0.014051617123186588, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "distill_loss": 0.17900004982948303, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "ref_ce_loss": 0.03179965540766716, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "loss": 0.3302898108959198, + "step": 23250 + }, + { + "ce_loss": 0.02950301021337509, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "distill_loss": 0.1390431523323059, + "epoch": 7.755170113408939, + "step": 23250 + }, + { + "epoch": 7.755170113408939, + "ref_ce_loss": 0.03176158294081688, + "step": 23250 + }, + { + "epoch": 7.758505670446965, + "loss": 0.3264, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "grad_norm": 3.3747599124908447, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "learning_rate": 7.163469019274115e-07, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "loss": 0.28049081563949585, + "step": 23260 + }, + { + "ce_loss": 0.050241705030202866, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "distill_loss": 0.1570914089679718, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "ref_ce_loss": 0.041322261095047, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "loss": 0.46387675404548645, + "step": 23260 + }, + { + "ce_loss": 0.04370967298746109, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "distill_loss": 0.2979971468448639, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "ref_ce_loss": 0.06697692722082138, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "loss": 0.32402563095092773, + "step": 23260 + }, + { + "ce_loss": 0.023801935836672783, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "distill_loss": 0.15474510192871094, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "ref_ce_loss": 0.05275455862283707, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "loss": 0.40194663405418396, + "step": 23260 + }, + { + "ce_loss": 0.02307719551026821, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "distill_loss": 0.09973050653934479, + "epoch": 7.758505670446965, + "step": 23260 + }, + { + "epoch": 7.758505670446965, + "ref_ce_loss": 0.052247051149606705, + "step": 23260 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.3557, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "grad_norm": 3.8513638973236084, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "learning_rate": 6.967102023471283e-07, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.12965351343154907, + "step": 23270 + }, + { + "ce_loss": 0.0026622305158525705, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "distill_loss": 0.08537213504314423, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "ref_ce_loss": 0.0415009967982769, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.1693611890077591, + "step": 23270 + }, + { + "ce_loss": 0.008908931165933609, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "distill_loss": 0.10009366273880005, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "ref_ce_loss": 0.045529551804065704, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.3749549686908722, + "step": 23270 + }, + { + "ce_loss": 0.08394438028335571, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "distill_loss": 0.11877831071615219, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "ref_ce_loss": 0.08435434848070145, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "loss": 0.25630539655685425, + "step": 23270 + }, + { + "ce_loss": 0.032179322093725204, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "distill_loss": 0.14230948686599731, + "epoch": 7.7618412274849895, + "step": 23270 + }, + { + "epoch": 7.7618412274849895, + "ref_ce_loss": 0.05302827060222626, + "step": 23270 + }, + { + "epoch": 7.765176784523016, + "loss": 0.3329, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "grad_norm": 4.037125110626221, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "learning_rate": 6.773457730720966e-07, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "loss": 0.24391396343708038, + "step": 23280 + }, + { + "ce_loss": 0.007923566736280918, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "distill_loss": 0.14070750772953033, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "ref_ce_loss": 0.06303049623966217, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "loss": 0.3306331932544708, + "step": 23280 + }, + { + "ce_loss": 0.10012779384851456, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "distill_loss": 0.13132646679878235, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "ref_ce_loss": 0.06317123770713806, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "loss": 0.30709633231163025, + "step": 23280 + }, + { + "ce_loss": 0.01032828539609909, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "distill_loss": 0.20121553540229797, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "ref_ce_loss": 0.050020549446344376, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "loss": 0.2119283825159073, + "step": 23280 + }, + { + "ce_loss": 0.010647183284163475, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "distill_loss": 0.15966114401817322, + "epoch": 7.765176784523016, + "step": 23280 + }, + { + "epoch": 7.765176784523016, + "ref_ce_loss": 0.041549064218997955, + "step": 23280 + }, + { + "epoch": 7.76851234156104, + "loss": 0.338, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "grad_norm": 4.431809902191162, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "learning_rate": 6.582536494154022e-07, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "loss": 0.2826223373413086, + "step": 23290 + }, + { + "ce_loss": 0.018869668245315552, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "distill_loss": 0.21956700086593628, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "ref_ce_loss": 0.04408290982246399, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "loss": 0.22245214879512787, + "step": 23290 + }, + { + "ce_loss": 0.02188754640519619, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "distill_loss": 0.12243182957172394, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "ref_ce_loss": 0.045550353825092316, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "loss": 0.24546176195144653, + "step": 23290 + }, + { + "ce_loss": 0.04683222249150276, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "distill_loss": 0.13283726572990417, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "ref_ce_loss": 0.04206204414367676, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "loss": 0.16294394433498383, + "step": 23290 + }, + { + "ce_loss": 0.008387603797018528, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "distill_loss": 0.08378133177757263, + "epoch": 7.76851234156104, + "step": 23290 + }, + { + "epoch": 7.76851234156104, + "ref_ce_loss": 0.033749647438526154, + "step": 23290 + }, + { + "epoch": 7.771847898599066, + "loss": 0.3135, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "grad_norm": 3.9904065132141113, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "learning_rate": 6.39433866193545e-07, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "loss": 0.6241667866706848, + "step": 23300 + }, + { + "ce_loss": 0.024339929223060608, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "distill_loss": 0.19311808049678802, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "ref_ce_loss": 0.05117562785744667, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "loss": 0.2056247889995575, + "step": 23300 + }, + { + "ce_loss": 0.014999239705502987, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "distill_loss": 0.13496920466423035, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "ref_ce_loss": 0.05555931106209755, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "loss": 0.4284008741378784, + "step": 23300 + }, + { + "ce_loss": 0.03497346118092537, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "distill_loss": 0.23885288834571838, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "ref_ce_loss": 0.050635844469070435, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "loss": 0.34450194239616394, + "step": 23300 + }, + { + "ce_loss": 0.056762758642435074, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "distill_loss": 0.17033667862415314, + "epoch": 7.771847898599066, + "step": 23300 + }, + { + "epoch": 7.771847898599066, + "ref_ce_loss": 0.0646563395857811, + "step": 23300 + }, + { + "epoch": 7.775183455637091, + "loss": 0.3266, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "grad_norm": 3.700901746749878, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "learning_rate": 6.208864577263717e-07, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "loss": 0.31795617938041687, + "step": 23310 + }, + { + "ce_loss": 0.023845195770263672, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "distill_loss": 0.1281346082687378, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "ref_ce_loss": 0.04168390855193138, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "loss": 0.2685645818710327, + "step": 23310 + }, + { + "ce_loss": 0.030331619083881378, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "distill_loss": 0.1365409791469574, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "ref_ce_loss": 0.0594908744096756, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "loss": 0.24335750937461853, + "step": 23310 + }, + { + "ce_loss": 0.05650734901428223, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "distill_loss": 0.1242005005478859, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "ref_ce_loss": 0.04149289056658745, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "loss": 0.3003118634223938, + "step": 23310 + }, + { + "ce_loss": 0.020960014313459396, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "distill_loss": 0.21566730737686157, + "epoch": 7.775183455637091, + "step": 23310 + }, + { + "epoch": 7.775183455637091, + "ref_ce_loss": 0.047933224588632584, + "step": 23310 + }, + { + "epoch": 7.778519012675117, + "loss": 0.3369, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "grad_norm": 2.937605857849121, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "learning_rate": 6.026114578370434e-07, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "loss": 0.25859567523002625, + "step": 23320 + }, + { + "ce_loss": 0.028528448194265366, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "distill_loss": 0.1434447169303894, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "ref_ce_loss": 0.045929595828056335, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "loss": 0.6326279044151306, + "step": 23320 + }, + { + "ce_loss": 0.017608165740966797, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "distill_loss": 0.13123691082000732, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "ref_ce_loss": 0.04305972158908844, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "loss": 0.21977195143699646, + "step": 23320 + }, + { + "ce_loss": 0.031461816281080246, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "distill_loss": 0.14010189473628998, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "ref_ce_loss": 0.03188446909189224, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "loss": 0.25341397523880005, + "step": 23320 + }, + { + "ce_loss": 0.02488807961344719, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "distill_loss": 0.15948396921157837, + "epoch": 7.778519012675117, + "step": 23320 + }, + { + "epoch": 7.778519012675117, + "ref_ce_loss": 0.06859219819307327, + "step": 23320 + }, + { + "epoch": 7.781854569713142, + "loss": 0.2975, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "grad_norm": 2.587712049484253, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "learning_rate": 5.846088998519683e-07, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "loss": 0.3860955238342285, + "step": 23330 + }, + { + "ce_loss": 0.009484180249273777, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "distill_loss": 0.13444848358631134, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "ref_ce_loss": 0.03321141377091408, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "loss": 0.4119266867637634, + "step": 23330 + }, + { + "ce_loss": 0.0233280248939991, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "distill_loss": 0.2424059808254242, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "ref_ce_loss": 0.06320960074663162, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "loss": 0.2660433351993561, + "step": 23330 + }, + { + "ce_loss": 0.06393882632255554, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "distill_loss": 0.14668019115924835, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "ref_ce_loss": 0.055365853011608124, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "loss": 0.27549660205841064, + "step": 23330 + }, + { + "ce_loss": 0.0275224968791008, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "distill_loss": 0.18359704315662384, + "epoch": 7.781854569713142, + "step": 23330 + }, + { + "epoch": 7.781854569713142, + "ref_ce_loss": 0.02890479750931263, + "step": 23330 + }, + { + "epoch": 7.785190126751168, + "loss": 0.2935, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "grad_norm": 3.114147663116455, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "learning_rate": 5.668788166006854e-07, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "loss": 0.2866002917289734, + "step": 23340 + }, + { + "ce_loss": 0.008112505078315735, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "distill_loss": 0.16037411987781525, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "ref_ce_loss": 0.04638290032744408, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "loss": 0.2657839059829712, + "step": 23340 + }, + { + "ce_loss": 0.0471295528113842, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "distill_loss": 0.13989612460136414, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "ref_ce_loss": 0.045277271419763565, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "loss": 0.3349273204803467, + "step": 23340 + }, + { + "ce_loss": 0.037495408207178116, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "distill_loss": 0.20219463109970093, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "ref_ce_loss": 0.07002187520265579, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "loss": 0.46111318469047546, + "step": 23340 + }, + { + "ce_loss": 0.05854769051074982, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "distill_loss": 0.28044867515563965, + "epoch": 7.785190126751168, + "step": 23340 + }, + { + "epoch": 7.785190126751168, + "ref_ce_loss": 0.07100299000740051, + "step": 23340 + }, + { + "epoch": 7.788525683789192, + "loss": 0.3275, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "grad_norm": 3.6980140209198, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "learning_rate": 5.494212404158982e-07, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "loss": 0.25768497586250305, + "step": 23350 + }, + { + "ce_loss": 0.011918849311769009, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "distill_loss": 0.1634317934513092, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "ref_ce_loss": 0.043155863881111145, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "loss": 0.216793954372406, + "step": 23350 + }, + { + "ce_loss": 0.02071814425289631, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "distill_loss": 0.14459185302257538, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "ref_ce_loss": 0.05123360455036163, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "loss": 0.1484888792037964, + "step": 23350 + }, + { + "ce_loss": 0.010345513932406902, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "distill_loss": 0.08661268651485443, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "ref_ce_loss": 0.03675112500786781, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "loss": 0.205137237906456, + "step": 23350 + }, + { + "ce_loss": 0.04296492040157318, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "distill_loss": 0.11191177368164062, + "epoch": 7.788525683789192, + "step": 23350 + }, + { + "epoch": 7.788525683789192, + "ref_ce_loss": 0.05003888159990311, + "step": 23350 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.3069, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "grad_norm": 2.725006580352783, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "learning_rate": 5.322362031333238e-07, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.4058573842048645, + "step": 23360 + }, + { + "ce_loss": 0.03540191799402237, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "distill_loss": 0.11767835170030594, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "ref_ce_loss": 0.029293276369571686, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.4584265351295471, + "step": 23360 + }, + { + "ce_loss": 0.0269757267087698, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "distill_loss": 0.2049024999141693, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "ref_ce_loss": 0.048592083156108856, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.3664950728416443, + "step": 23360 + }, + { + "ce_loss": 0.0457327701151371, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "distill_loss": 0.2076897770166397, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "ref_ce_loss": 0.0358634777367115, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "loss": 0.1988409459590912, + "step": 23360 + }, + { + "ce_loss": 0.013353724963963032, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "distill_loss": 0.14962157607078552, + "epoch": 7.7918612408272185, + "step": 23360 + }, + { + "epoch": 7.7918612408272185, + "ref_ce_loss": 0.035612523555755615, + "step": 23360 + }, + { + "epoch": 7.795196797865243, + "loss": 0.3187, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "grad_norm": 3.3891184329986572, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "learning_rate": 5.153237360916773e-07, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "loss": 0.5488206744194031, + "step": 23370 + }, + { + "ce_loss": 0.10254282504320145, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "distill_loss": 0.378812313079834, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "ref_ce_loss": 0.04811196029186249, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "loss": 0.4255595803260803, + "step": 23370 + }, + { + "ce_loss": 0.03799142688512802, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "distill_loss": 0.2826904058456421, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "ref_ce_loss": 0.04478185623884201, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "loss": 0.2226969301700592, + "step": 23370 + }, + { + "ce_loss": 0.021655146032571793, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "distill_loss": 0.1437462568283081, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "ref_ce_loss": 0.046798039227724075, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "loss": 0.48034214973449707, + "step": 23370 + }, + { + "ce_loss": 0.046554792672395706, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "distill_loss": 0.3787533640861511, + "epoch": 7.795196797865243, + "step": 23370 + }, + { + "epoch": 7.795196797865243, + "ref_ce_loss": 0.05498090758919716, + "step": 23370 + }, + { + "epoch": 7.798532354903269, + "loss": 0.3151, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "grad_norm": 4.284021854400635, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "learning_rate": 4.986838701326545e-07, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "loss": 0.3707292079925537, + "step": 23380 + }, + { + "ce_loss": 0.09322760999202728, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "distill_loss": 0.21029648184776306, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "ref_ce_loss": 0.05152899771928787, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "loss": 0.3132014870643616, + "step": 23380 + }, + { + "ce_loss": 0.04309116676449776, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "distill_loss": 0.14849551022052765, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "ref_ce_loss": 0.05384783446788788, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "loss": 0.2031479924917221, + "step": 23380 + }, + { + "ce_loss": 0.010098463855683804, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "distill_loss": 0.1490442454814911, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "ref_ce_loss": 0.029072962701320648, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "loss": 0.23375585675239563, + "step": 23380 + }, + { + "ce_loss": 0.013286544941365719, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "distill_loss": 0.10644297301769257, + "epoch": 7.798532354903269, + "step": 23380 + }, + { + "epoch": 7.798532354903269, + "ref_ce_loss": 0.02351474016904831, + "step": 23380 + }, + { + "epoch": 7.801867911941294, + "loss": 0.3135, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "grad_norm": 3.123403310775757, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "learning_rate": 4.82316635600799e-07, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "loss": 0.2821028530597687, + "step": 23390 + }, + { + "ce_loss": 0.019099919125437737, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "distill_loss": 0.17505885660648346, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "ref_ce_loss": 0.06228472664952278, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "loss": 0.18802164494991302, + "step": 23390 + }, + { + "ce_loss": 0.009978541173040867, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "distill_loss": 0.11921600997447968, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "ref_ce_loss": 0.03513723239302635, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "loss": 0.45474499464035034, + "step": 23390 + }, + { + "ce_loss": 0.07498273998498917, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "distill_loss": 0.19355358183383942, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "ref_ce_loss": 0.07677411288022995, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "loss": 0.270125150680542, + "step": 23390 + }, + { + "ce_loss": 0.06720106303691864, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "distill_loss": 0.13926897943019867, + "epoch": 7.801867911941294, + "step": 23390 + }, + { + "epoch": 7.801867911941294, + "ref_ce_loss": 0.038092099130153656, + "step": 23390 + }, + { + "epoch": 7.80520346897932, + "loss": 0.3264, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "grad_norm": 2.53342604637146, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "learning_rate": 4.662220623434854e-07, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "loss": 0.3736017942428589, + "step": 23400 + }, + { + "ce_loss": 0.02059594914317131, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "distill_loss": 0.14803819358348846, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "ref_ce_loss": 0.06236787885427475, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "loss": 0.34300127625465393, + "step": 23400 + }, + { + "ce_loss": 0.07793295383453369, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "distill_loss": 0.13648638129234314, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "ref_ce_loss": 0.04996887966990471, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "loss": 0.27772849798202515, + "step": 23400 + }, + { + "ce_loss": 0.0037580865900963545, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "distill_loss": 0.2001986801624298, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "ref_ce_loss": 0.043960537761449814, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "loss": 0.2673993706703186, + "step": 23400 + }, + { + "ce_loss": 0.052981454879045486, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "distill_loss": 0.12502837181091309, + "epoch": 7.80520346897932, + "step": 23400 + }, + { + "epoch": 7.80520346897932, + "ref_ce_loss": 0.060486406087875366, + "step": 23400 + }, + { + "epoch": 7.808539026017344, + "loss": 0.3236, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "grad_norm": 3.594543695449829, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "learning_rate": 4.504001797108692e-07, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "loss": 0.21197645366191864, + "step": 23410 + }, + { + "ce_loss": 0.033052023500204086, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "distill_loss": 0.12372135370969772, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "ref_ce_loss": 0.03794866055250168, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "loss": 0.22799083590507507, + "step": 23410 + }, + { + "ce_loss": 0.012985233217477798, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "distill_loss": 0.18273982405662537, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "ref_ce_loss": 0.0319737084209919, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "loss": 0.32432064414024353, + "step": 23410 + }, + { + "ce_loss": 0.03222603350877762, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "distill_loss": 0.2243024706840515, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "ref_ce_loss": 0.03382931277155876, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "loss": 0.19178549945354462, + "step": 23410 + }, + { + "ce_loss": 0.029627012088894844, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "distill_loss": 0.12371587753295898, + "epoch": 7.808539026017344, + "step": 23410 + }, + { + "epoch": 7.808539026017344, + "ref_ce_loss": 0.038338132202625275, + "step": 23410 + }, + { + "epoch": 7.811874583055371, + "loss": 0.324, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "grad_norm": 3.236091375350952, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "learning_rate": 4.3485101655582057e-07, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "loss": 0.3116261065006256, + "step": 23420 + }, + { + "ce_loss": 0.039746448397636414, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "distill_loss": 0.12467524409294128, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "ref_ce_loss": 0.054104890674352646, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "loss": 0.346844881772995, + "step": 23420 + }, + { + "ce_loss": 0.05025966838002205, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "distill_loss": 0.19769015908241272, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "ref_ce_loss": 0.06749259680509567, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "loss": 0.22081111371517181, + "step": 23420 + }, + { + "ce_loss": 0.04156069457530975, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "distill_loss": 0.13512654602527618, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "ref_ce_loss": 0.04406435787677765, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "loss": 0.24682305753231049, + "step": 23420 + }, + { + "ce_loss": 0.019891489297151566, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "distill_loss": 0.1656222641468048, + "epoch": 7.811874583055371, + "step": 23420 + }, + { + "epoch": 7.811874583055371, + "ref_ce_loss": 0.061124760657548904, + "step": 23420 + }, + { + "epoch": 7.815210140093396, + "loss": 0.2939, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "grad_norm": 4.592631816864014, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "learning_rate": 4.1957460123389074e-07, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "loss": 0.23414938151836395, + "step": 23430 + }, + { + "ce_loss": 0.02321458049118519, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "distill_loss": 0.12797735631465912, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "ref_ce_loss": 0.05279207602143288, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "loss": 0.24856454133987427, + "step": 23430 + }, + { + "ce_loss": 0.012455428019165993, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "distill_loss": 0.1560458242893219, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "ref_ce_loss": 0.06088119000196457, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "loss": 0.39245137572288513, + "step": 23430 + }, + { + "ce_loss": 0.0226961188018322, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "distill_loss": 0.21423345804214478, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "ref_ce_loss": 0.055108457803726196, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "loss": 0.3869752883911133, + "step": 23430 + }, + { + "ce_loss": 0.027073608711361885, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "distill_loss": 0.18560923635959625, + "epoch": 7.815210140093396, + "step": 23430 + }, + { + "epoch": 7.815210140093396, + "ref_ce_loss": 0.060218311846256256, + "step": 23430 + }, + { + "epoch": 7.818545697131421, + "loss": 0.334, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "grad_norm": 4.625476837158203, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "learning_rate": 4.045709616032122e-07, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "loss": 0.40051406621932983, + "step": 23440 + }, + { + "ce_loss": 0.024206679314374924, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "distill_loss": 0.2373490333557129, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "ref_ce_loss": 0.049871303141117096, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "loss": 0.48238322138786316, + "step": 23440 + }, + { + "ce_loss": 0.03466752916574478, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "distill_loss": 0.32631319761276245, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "ref_ce_loss": 0.050187211483716965, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "loss": 0.2827662527561188, + "step": 23440 + }, + { + "ce_loss": 0.01909097470343113, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "distill_loss": 0.21451322734355927, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "ref_ce_loss": 0.04905122518539429, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "loss": 0.17934754490852356, + "step": 23440 + }, + { + "ce_loss": 0.0021979936864227057, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "distill_loss": 0.10636477172374725, + "epoch": 7.818545697131421, + "step": 23440 + }, + { + "epoch": 7.818545697131421, + "ref_ce_loss": 0.01800953969359398, + "step": 23440 + }, + { + "epoch": 7.821881254169447, + "loss": 0.3487, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "grad_norm": 7.102529525756836, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "learning_rate": 3.89840125024532e-07, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "loss": 0.45607298612594604, + "step": 23450 + }, + { + "ce_loss": 0.06953687965869904, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "distill_loss": 0.13492602109909058, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "ref_ce_loss": 0.07326891273260117, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "loss": 0.2469940036535263, + "step": 23450 + }, + { + "ce_loss": 0.042449042201042175, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "distill_loss": 0.13725703954696655, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "ref_ce_loss": 0.04501740261912346, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "loss": 0.3980846107006073, + "step": 23450 + }, + { + "ce_loss": 0.0027305660769343376, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "distill_loss": 0.3378257751464844, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "ref_ce_loss": 0.04068752005696297, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "loss": 0.26008281111717224, + "step": 23450 + }, + { + "ce_loss": 0.06028151512145996, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "distill_loss": 0.1407707929611206, + "epoch": 7.821881254169447, + "step": 23450 + }, + { + "epoch": 7.821881254169447, + "ref_ce_loss": 0.04119338467717171, + "step": 23450 + }, + { + "epoch": 7.825216811207472, + "loss": 0.3276, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "grad_norm": 2.3489158153533936, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "learning_rate": 3.753821183610617e-07, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "loss": 0.5334453582763672, + "step": 23460 + }, + { + "ce_loss": 0.021312382072210312, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "distill_loss": 0.19238534569740295, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "ref_ce_loss": 0.05404605343937874, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "loss": 0.33135032653808594, + "step": 23460 + }, + { + "ce_loss": 0.07059341669082642, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "distill_loss": 0.15060116350650787, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "ref_ce_loss": 0.06151656061410904, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "loss": 0.24996304512023926, + "step": 23460 + }, + { + "ce_loss": 0.027928778901696205, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "distill_loss": 0.16909432411193848, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "ref_ce_loss": 0.039568815380334854, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "loss": 0.21376681327819824, + "step": 23460 + }, + { + "ce_loss": 0.019738253206014633, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "distill_loss": 0.14254921674728394, + "epoch": 7.825216811207472, + "step": 23460 + }, + { + "epoch": 7.825216811207472, + "ref_ce_loss": 0.037193119525909424, + "step": 23460 + }, + { + "epoch": 7.828552368245497, + "loss": 0.3357, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "grad_norm": 9.519171714782715, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "learning_rate": 3.611969679785109e-07, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "loss": 0.20791323482990265, + "step": 23470 + }, + { + "ce_loss": 0.023825544863939285, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "distill_loss": 0.15400870144367218, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "ref_ce_loss": 0.02994854561984539, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "loss": 0.4624814987182617, + "step": 23470 + }, + { + "ce_loss": 0.06506424397230148, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "distill_loss": 0.21073627471923828, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "ref_ce_loss": 0.07934938371181488, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "loss": 0.2593517005443573, + "step": 23470 + }, + { + "ce_loss": 0.002018554601818323, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "distill_loss": 0.2089160680770874, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "ref_ce_loss": 0.03180965408682823, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "loss": 0.4130901098251343, + "step": 23470 + }, + { + "ce_loss": 0.053077515214681625, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "distill_loss": 0.19720834493637085, + "epoch": 7.828552368245497, + "step": 23470 + }, + { + "epoch": 7.828552368245497, + "ref_ce_loss": 0.08093966543674469, + "step": 23470 + }, + { + "epoch": 7.831887925283523, + "loss": 0.3327, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "grad_norm": 4.104872703552246, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "learning_rate": 3.4728469974500404e-07, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "loss": 0.23577287793159485, + "step": 23480 + }, + { + "ce_loss": 0.017805000767111778, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "distill_loss": 0.10591477900743484, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "ref_ce_loss": 0.05476228520274162, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "loss": 0.20567350089550018, + "step": 23480 + }, + { + "ce_loss": 0.011999239213764668, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "distill_loss": 0.12367543578147888, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "ref_ce_loss": 0.03651357442140579, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "loss": 0.2672771215438843, + "step": 23480 + }, + { + "ce_loss": 0.03912220522761345, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "distill_loss": 0.1666669398546219, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "ref_ce_loss": 0.04421749338507652, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "loss": 0.357584148645401, + "step": 23480 + }, + { + "ce_loss": 0.009105579927563667, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "distill_loss": 0.16976270079612732, + "epoch": 7.831887925283523, + "step": 23480 + }, + { + "epoch": 7.831887925283523, + "ref_ce_loss": 0.050727520138025284, + "step": 23480 + }, + { + "epoch": 7.835223482321548, + "loss": 0.2992, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "grad_norm": 3.172041177749634, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "learning_rate": 3.3364533903101343e-07, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "loss": 0.3321012854576111, + "step": 23490 + }, + { + "ce_loss": 0.06076185032725334, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "distill_loss": 0.16718757152557373, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "ref_ce_loss": 0.06900273263454437, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "loss": 0.16614539921283722, + "step": 23490 + }, + { + "ce_loss": 0.011339708231389523, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "distill_loss": 0.10029933601617813, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "ref_ce_loss": 0.021356748417019844, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "loss": 0.27904677391052246, + "step": 23490 + }, + { + "ce_loss": 0.01795981265604496, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "distill_loss": 0.14555040001869202, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "ref_ce_loss": 0.0673777163028717, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "loss": 0.4013983905315399, + "step": 23490 + }, + { + "ce_loss": 0.03549182042479515, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "distill_loss": 0.24230507016181946, + "epoch": 7.835223482321548, + "step": 23490 + }, + { + "epoch": 7.835223482321548, + "ref_ce_loss": 0.06218017637729645, + "step": 23490 + }, + { + "epoch": 7.838559039359573, + "loss": 0.3185, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "grad_norm": 3.262314558029175, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "learning_rate": 3.202789107093762e-07, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "loss": 0.46285945177078247, + "step": 23500 + }, + { + "ce_loss": 0.016897911205887794, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "distill_loss": 0.15950268507003784, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "ref_ce_loss": 0.06392750889062881, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "loss": 0.2956462502479553, + "step": 23500 + }, + { + "ce_loss": 0.007706071715801954, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "distill_loss": 0.17924486100673676, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "ref_ce_loss": 0.04117276147007942, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "loss": 0.2654488980770111, + "step": 23500 + }, + { + "ce_loss": 0.04544892907142639, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "distill_loss": 0.15256768465042114, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "ref_ce_loss": 0.043563321232795715, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "loss": 0.3258354365825653, + "step": 23500 + }, + { + "ce_loss": 0.028348958119750023, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "distill_loss": 0.16680380702018738, + "epoch": 7.838559039359573, + "step": 23500 + }, + { + "epoch": 7.838559039359573, + "ref_ce_loss": 0.041485149413347244, + "step": 23500 + }, + { + "epoch": 7.841894596397599, + "loss": 0.3601, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "grad_norm": 2.688413619995117, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "learning_rate": 3.0718543915517756e-07, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "loss": 0.21694333851337433, + "step": 23510 + }, + { + "ce_loss": 0.005275980569422245, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "distill_loss": 0.1437319666147232, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "ref_ce_loss": 0.04064946621656418, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "loss": 0.14688892662525177, + "step": 23510 + }, + { + "ce_loss": 0.005248190835118294, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "distill_loss": 0.12220533192157745, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "ref_ce_loss": 0.019273869693279266, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "loss": 0.28784316778182983, + "step": 23510 + }, + { + "ce_loss": 0.0338323637843132, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "distill_loss": 0.19960737228393555, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "ref_ce_loss": 0.05415638908743858, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "loss": 0.26387009024620056, + "step": 23510 + }, + { + "ce_loss": 0.035002514719963074, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "distill_loss": 0.13795851171016693, + "epoch": 7.841894596397599, + "step": 23510 + }, + { + "epoch": 7.841894596397599, + "ref_ce_loss": 0.054587170481681824, + "step": 23510 + }, + { + "epoch": 7.845230153435624, + "loss": 0.2939, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "grad_norm": 3.071887731552124, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "learning_rate": 2.943649482457344e-07, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "loss": 0.19254055619239807, + "step": 23520 + }, + { + "ce_loss": 0.016048870980739594, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "distill_loss": 0.11593937128782272, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "ref_ce_loss": 0.045436661690473557, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "loss": 0.2930757999420166, + "step": 23520 + }, + { + "ce_loss": 0.030346477404236794, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "distill_loss": 0.15989623963832855, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "ref_ce_loss": 0.05691306293010712, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "loss": 0.39531221985816956, + "step": 23520 + }, + { + "ce_loss": 0.015122991986572742, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "distill_loss": 0.28203243017196655, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "ref_ce_loss": 0.06494788825511932, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "loss": 0.33640390634536743, + "step": 23520 + }, + { + "ce_loss": 0.08751996606588364, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "distill_loss": 0.15109512209892273, + "epoch": 7.845230153435624, + "step": 23520 + }, + { + "epoch": 7.845230153435624, + "ref_ce_loss": 0.04479917511343956, + "step": 23520 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.2966, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "grad_norm": 2.259341239929199, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "learning_rate": 2.81817461360595e-07, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.32070645689964294, + "step": 23530 + }, + { + "ce_loss": 0.025516116991639137, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "distill_loss": 0.22206610441207886, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "ref_ce_loss": 0.04203583672642708, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.30409732460975647, + "step": 23530 + }, + { + "ce_loss": 0.008006962016224861, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "distill_loss": 0.2180558443069458, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "ref_ce_loss": 0.07791711390018463, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.546816885471344, + "step": 23530 + }, + { + "ce_loss": 0.010635165497660637, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "distill_loss": 0.3440686762332916, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "ref_ce_loss": 0.06196082383394241, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "loss": 0.35767361521720886, + "step": 23530 + }, + { + "ce_loss": 0.06103089079260826, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "distill_loss": 0.127061128616333, + "epoch": 7.8485657104736495, + "step": 23530 + }, + { + "epoch": 7.8485657104736495, + "ref_ce_loss": 0.05860128253698349, + "step": 23530 + }, + { + "epoch": 7.851901267511675, + "loss": 0.3247, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "grad_norm": 3.118084669113159, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "learning_rate": 2.695430013813726e-07, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "loss": 0.36181530356407166, + "step": 23540 + }, + { + "ce_loss": 0.06342129409313202, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "distill_loss": 0.20927932858467102, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "ref_ce_loss": 0.05260811373591423, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "loss": 0.4537930488586426, + "step": 23540 + }, + { + "ce_loss": 0.06014955788850784, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "distill_loss": 0.20907607674598694, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "ref_ce_loss": 0.06965752691030502, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "loss": 0.3706998825073242, + "step": 23540 + }, + { + "ce_loss": 0.04698286950588226, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "distill_loss": 0.14785529673099518, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "ref_ce_loss": 0.06559489667415619, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "loss": 0.2899203896522522, + "step": 23540 + }, + { + "ce_loss": 0.03699249029159546, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "distill_loss": 0.11820313334465027, + "epoch": 7.851901267511675, + "step": 23540 + }, + { + "epoch": 7.851901267511675, + "ref_ce_loss": 0.0372081995010376, + "step": 23540 + }, + { + "epoch": 7.8552368245497, + "loss": 0.3375, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "grad_norm": 5.25492525100708, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "learning_rate": 2.5754159069187876e-07, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "loss": 0.25230440497398376, + "step": 23550 + }, + { + "ce_loss": 0.04072759672999382, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "distill_loss": 0.13626137375831604, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "ref_ce_loss": 0.05338587239384651, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "loss": 0.3071846663951874, + "step": 23550 + }, + { + "ce_loss": 0.025177132338285446, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "distill_loss": 0.18812674283981323, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "ref_ce_loss": 0.06389034539461136, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "loss": 0.27324751019477844, + "step": 23550 + }, + { + "ce_loss": 0.0538780502974987, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "distill_loss": 0.1565019190311432, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "ref_ce_loss": 0.04335758090019226, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "loss": 0.2749263048171997, + "step": 23550 + }, + { + "ce_loss": 0.028948500752449036, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "distill_loss": 0.17235276103019714, + "epoch": 7.8552368245497, + "step": 23550 + }, + { + "epoch": 7.8552368245497, + "ref_ce_loss": 0.05884414166212082, + "step": 23550 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.3391, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "grad_norm": 2.6983072757720947, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "learning_rate": 2.458132511779565e-07, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.24318772554397583, + "step": 23560 + }, + { + "ce_loss": 0.029617050662636757, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "distill_loss": 0.13530349731445312, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "ref_ce_loss": 0.043204981833696365, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.25192779302597046, + "step": 23560 + }, + { + "ce_loss": 0.008975867182016373, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "distill_loss": 0.1775493025779724, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "ref_ce_loss": 0.03575044870376587, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.22142037749290466, + "step": 23560 + }, + { + "ce_loss": 0.02612127549946308, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "distill_loss": 0.10729941725730896, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "ref_ce_loss": 0.06410533934831619, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "loss": 0.2029029130935669, + "step": 23560 + }, + { + "ce_loss": 0.01904483139514923, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "distill_loss": 0.13385045528411865, + "epoch": 7.8585723815877255, + "step": 23560 + }, + { + "epoch": 7.8585723815877255, + "ref_ce_loss": 0.04983455315232277, + "step": 23560 + }, + { + "epoch": 7.861907938625751, + "loss": 0.3088, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "grad_norm": 3.6548972129821777, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "learning_rate": 2.3435800422744733e-07, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "loss": 0.35257551074028015, + "step": 23570 + }, + { + "ce_loss": 0.030391503125429153, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "distill_loss": 0.1383948028087616, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "ref_ce_loss": 0.06357093900442123, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "loss": 0.3334633409976959, + "step": 23570 + }, + { + "ce_loss": 0.05558884143829346, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "distill_loss": 0.20167039334774017, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "ref_ce_loss": 0.061854951083660126, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "loss": 0.30800241231918335, + "step": 23570 + }, + { + "ce_loss": 0.0217773225158453, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "distill_loss": 0.16537217795848846, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "ref_ce_loss": 0.05210186913609505, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "loss": 0.2675626277923584, + "step": 23570 + }, + { + "ce_loss": 0.034394048154354095, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "distill_loss": 0.12180566042661667, + "epoch": 7.861907938625751, + "step": 23570 + }, + { + "epoch": 7.861907938625751, + "ref_ce_loss": 0.06949251890182495, + "step": 23570 + }, + { + "epoch": 7.865243495663776, + "loss": 0.3309, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "grad_norm": 3.2683770656585693, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "learning_rate": 2.2317587073020782e-07, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "loss": 0.24115531146526337, + "step": 23580 + }, + { + "ce_loss": 0.04724248871207237, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "distill_loss": 0.14451994001865387, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "ref_ce_loss": 0.04924272000789642, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "loss": 0.25895681977272034, + "step": 23580 + }, + { + "ce_loss": 0.0406833216547966, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "distill_loss": 0.12115300446748734, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "ref_ce_loss": 0.04782585799694061, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "loss": 0.4971678853034973, + "step": 23580 + }, + { + "ce_loss": 0.02928864024579525, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "distill_loss": 0.40851619839668274, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "ref_ce_loss": 0.05929753929376602, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "loss": 0.2548585534095764, + "step": 23580 + }, + { + "ce_loss": 0.02185073494911194, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "distill_loss": 0.19197684526443481, + "epoch": 7.865243495663776, + "step": 23580 + }, + { + "epoch": 7.865243495663776, + "ref_ce_loss": 0.040667105466127396, + "step": 23580 + }, + { + "epoch": 7.868579052701802, + "loss": 0.3371, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "grad_norm": 2.3236372470855713, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "learning_rate": 2.122668710780595e-07, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "loss": 0.32696297764778137, + "step": 23590 + }, + { + "ce_loss": 0.04291388392448425, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "distill_loss": 0.1363409459590912, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "ref_ce_loss": 0.05377655103802681, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "loss": 0.294104665517807, + "step": 23590 + }, + { + "ce_loss": 0.023383162915706635, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "distill_loss": 0.13425928354263306, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "ref_ce_loss": 0.045569244772195816, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "loss": 0.16625121235847473, + "step": 23590 + }, + { + "ce_loss": 0.019522182643413544, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "distill_loss": 0.11160635203123093, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "ref_ce_loss": 0.0349382720887661, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "loss": 0.2597416937351227, + "step": 23590 + }, + { + "ce_loss": 0.04834597557783127, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "distill_loss": 0.14939779043197632, + "epoch": 7.868579052701802, + "step": 23590 + }, + { + "epoch": 7.868579052701802, + "ref_ce_loss": 0.061613187193870544, + "step": 23590 + }, + { + "epoch": 7.871914609739827, + "loss": 0.3061, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "grad_norm": 1.84232759475708, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "learning_rate": 2.016310251646891e-07, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "loss": 0.19464971125125885, + "step": 23600 + }, + { + "ce_loss": 0.0028662015683948994, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "distill_loss": 0.15298305451869965, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "ref_ce_loss": 0.03864739090204239, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "loss": 0.5443291664123535, + "step": 23600 + }, + { + "ce_loss": 0.04491385072469711, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "distill_loss": 0.2117350846529007, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "ref_ce_loss": 0.049522701650857925, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "loss": 0.30889323353767395, + "step": 23600 + }, + { + "ce_loss": 0.03567614406347275, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "distill_loss": 0.17977787554264069, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "ref_ce_loss": 0.05225006118416786, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "loss": 0.2686649560928345, + "step": 23600 + }, + { + "ce_loss": 0.019530225545167923, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "distill_loss": 0.20698407292366028, + "epoch": 7.871914609739827, + "step": 23600 + }, + { + "epoch": 7.871914609739827, + "ref_ce_loss": 0.030849579721689224, + "step": 23600 + }, + { + "epoch": 7.875250166777852, + "loss": 0.3492, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "grad_norm": 4.917800426483154, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "learning_rate": 1.9126835238569838e-07, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "loss": 0.2599946856498718, + "step": 23610 + }, + { + "ce_loss": 0.03415341302752495, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "distill_loss": 0.14789728820323944, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "ref_ce_loss": 0.03027048334479332, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "loss": 0.14788013696670532, + "step": 23610 + }, + { + "ce_loss": 0.012526956386864185, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "distill_loss": 0.10251648724079132, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "ref_ce_loss": 0.03267025575041771, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "loss": 0.1945207267999649, + "step": 23610 + }, + { + "ce_loss": 0.01578592136502266, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "distill_loss": 0.1346457153558731, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "ref_ce_loss": 0.04352135583758354, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "loss": 0.23414014279842377, + "step": 23610 + }, + { + "ce_loss": 0.04965338110923767, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "distill_loss": 0.1356775015592575, + "epoch": 7.875250166777852, + "step": 23610 + }, + { + "epoch": 7.875250166777852, + "ref_ce_loss": 0.04857119172811508, + "step": 23610 + }, + { + "epoch": 7.878585723815878, + "loss": 0.2745, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "grad_norm": 2.187544822692871, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "learning_rate": 1.811788716385043e-07, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "loss": 0.39947035908699036, + "step": 23620 + }, + { + "ce_loss": 0.009549811482429504, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "distill_loss": 0.20651386678218842, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "ref_ce_loss": 0.05287281423807144, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "loss": 0.4136260747909546, + "step": 23620 + }, + { + "ce_loss": 0.05234153941273689, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "distill_loss": 0.2478913813829422, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "ref_ce_loss": 0.046148356050252914, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "loss": 0.25423839688301086, + "step": 23620 + }, + { + "ce_loss": 0.03949936106801033, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "distill_loss": 0.15055415034294128, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "ref_ce_loss": 0.05031810700893402, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "loss": 0.2075003683567047, + "step": 23620 + }, + { + "ce_loss": 0.020473573356866837, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "distill_loss": 0.11563518643379211, + "epoch": 7.878585723815878, + "step": 23620 + }, + { + "epoch": 7.878585723815878, + "ref_ce_loss": 0.040492944419384, + "step": 23620 + }, + { + "epoch": 7.881921280853903, + "loss": 0.3381, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "grad_norm": 2.8946211338043213, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "learning_rate": 1.7136260132235568e-07, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "loss": 0.3676464557647705, + "step": 23630 + }, + { + "ce_loss": 0.04826905578374863, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "distill_loss": 0.22701773047447205, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "ref_ce_loss": 0.06122293695807457, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "loss": 0.2739025950431824, + "step": 23630 + }, + { + "ce_loss": 0.05898267775774002, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "distill_loss": 0.15476536750793457, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "ref_ce_loss": 0.0593196265399456, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "loss": 0.2718009948730469, + "step": 23630 + }, + { + "ce_loss": 0.02158510684967041, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "distill_loss": 0.151712566614151, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "ref_ce_loss": 0.03668779134750366, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "loss": 0.18738394975662231, + "step": 23630 + }, + { + "ce_loss": 0.03252572566270828, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "distill_loss": 0.11352349817752838, + "epoch": 7.881921280853903, + "step": 23630 + }, + { + "epoch": 7.881921280853903, + "ref_ce_loss": 0.031237930059432983, + "step": 23630 + }, + { + "epoch": 7.885256837891928, + "loss": 0.3456, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "grad_norm": 3.666982650756836, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "learning_rate": 1.6181955933824987e-07, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "loss": 0.2636154890060425, + "step": 23640 + }, + { + "ce_loss": 0.015169094316661358, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "distill_loss": 0.19849561154842377, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "ref_ce_loss": 0.028414104133844376, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "loss": 0.24772925674915314, + "step": 23640 + }, + { + "ce_loss": 0.010222217999398708, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "distill_loss": 0.174020454287529, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "ref_ce_loss": 0.03140445426106453, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "loss": 0.36517012119293213, + "step": 23640 + }, + { + "ce_loss": 0.04773040488362312, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "distill_loss": 0.23224672675132751, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "ref_ce_loss": 0.08495187014341354, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "loss": 0.31244736909866333, + "step": 23640 + }, + { + "ce_loss": 0.061612311750650406, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "distill_loss": 0.15047255158424377, + "epoch": 7.885256837891928, + "step": 23640 + }, + { + "epoch": 7.885256837891928, + "ref_ce_loss": 0.06343311071395874, + "step": 23640 + }, + { + "epoch": 7.888592394929954, + "loss": 0.3154, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "grad_norm": 3.4513959884643555, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "learning_rate": 1.5254976308891608e-07, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "loss": 0.266707181930542, + "step": 23650 + }, + { + "ce_loss": 0.011998561210930347, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "distill_loss": 0.18357567489147186, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "ref_ce_loss": 0.04288625344634056, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "loss": 0.24832196533679962, + "step": 23650 + }, + { + "ce_loss": 0.01114476379007101, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "distill_loss": 0.1261119693517685, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "ref_ce_loss": 0.05986611172556877, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "loss": 0.20678839087486267, + "step": 23650 + }, + { + "ce_loss": 0.008814916014671326, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "distill_loss": 0.1221609115600586, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "ref_ce_loss": 0.03747597709298134, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "loss": 0.2413693517446518, + "step": 23650 + }, + { + "ce_loss": 0.020267309620976448, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "distill_loss": 0.14778374135494232, + "epoch": 7.888592394929954, + "step": 23650 + }, + { + "epoch": 7.888592394929954, + "ref_ce_loss": 0.0390714630484581, + "step": 23650 + }, + { + "epoch": 7.891927951967979, + "loss": 0.294, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "grad_norm": 3.68865704536438, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "learning_rate": 1.435532294788322e-07, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "loss": 0.2128949910402298, + "step": 23660 + }, + { + "ce_loss": 0.004833103623241186, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "distill_loss": 0.158272847533226, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "ref_ce_loss": 0.027962271124124527, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "loss": 0.3560689091682434, + "step": 23660 + }, + { + "ce_loss": 0.03083612397313118, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "distill_loss": 0.17447644472122192, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "ref_ce_loss": 0.05506594106554985, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "loss": 0.38912326097488403, + "step": 23660 + }, + { + "ce_loss": 0.068735271692276, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "distill_loss": 0.16929185390472412, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "ref_ce_loss": 0.062119368463754654, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "loss": 0.2961581349372864, + "step": 23660 + }, + { + "ce_loss": 0.032267630100250244, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "distill_loss": 0.1588132381439209, + "epoch": 7.891927951967979, + "step": 23660 + }, + { + "epoch": 7.891927951967979, + "ref_ce_loss": 0.04926518350839615, + "step": 23660 + }, + { + "epoch": 7.895263509006004, + "loss": 0.2938, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "grad_norm": 3.3204846382141113, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "learning_rate": 1.3482997491410796e-07, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "loss": 0.2511689364910126, + "step": 23670 + }, + { + "ce_loss": 0.028505954891443253, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "distill_loss": 0.11724421381950378, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "ref_ce_loss": 0.05377810075879097, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "loss": 0.33648693561553955, + "step": 23670 + }, + { + "ce_loss": 0.0159254539757967, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "distill_loss": 0.12790490686893463, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "ref_ce_loss": 0.042552024126052856, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "loss": 0.3551349341869354, + "step": 23670 + }, + { + "ce_loss": 0.03814506158232689, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "distill_loss": 0.15532177686691284, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "ref_ce_loss": 0.04683928191661835, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "loss": 0.3603041470050812, + "step": 23670 + }, + { + "ce_loss": 0.028399253264069557, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "distill_loss": 0.2274635136127472, + "epoch": 7.895263509006004, + "step": 23670 + }, + { + "epoch": 7.895263509006004, + "ref_ce_loss": 0.052271027117967606, + "step": 23670 + }, + { + "epoch": 7.89859906604403, + "loss": 0.309, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "grad_norm": 3.2109603881835938, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "learning_rate": 1.263800153025185e-07, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "loss": 0.43084007501602173, + "step": 23680 + }, + { + "ce_loss": 0.005710463039577007, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "distill_loss": 0.13513536751270294, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "ref_ce_loss": 0.03500951826572418, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "loss": 0.5318657159805298, + "step": 23680 + }, + { + "ce_loss": 0.007542737293988466, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "distill_loss": 0.1139644905924797, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "ref_ce_loss": 0.052930302917957306, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "loss": 0.2502293288707733, + "step": 23680 + }, + { + "ce_loss": 0.0355820469558239, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "distill_loss": 0.11790099740028381, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "ref_ce_loss": 0.06142692267894745, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "loss": 0.36094042658805847, + "step": 23680 + }, + { + "ce_loss": 0.013484828174114227, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "distill_loss": 0.16219370067119598, + "epoch": 7.89859906604403, + "step": 23680 + }, + { + "epoch": 7.89859906604403, + "ref_ce_loss": 0.0636187344789505, + "step": 23680 + }, + { + "epoch": 7.901934623082055, + "loss": 0.337, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "grad_norm": 2.8887805938720703, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "learning_rate": 1.1820336605347092e-07, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "loss": 0.4276142120361328, + "step": 23690 + }, + { + "ce_loss": 0.06052268669009209, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "distill_loss": 0.19310811161994934, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "ref_ce_loss": 0.07668992877006531, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "loss": 0.20680920779705048, + "step": 23690 + }, + { + "ce_loss": 0.022963564842939377, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "distill_loss": 0.11855762451887131, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "ref_ce_loss": 0.06482337415218353, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "loss": 0.29407379031181335, + "step": 23690 + }, + { + "ce_loss": 0.026345551013946533, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "distill_loss": 0.15834666788578033, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "ref_ce_loss": 0.047213222831487656, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "loss": 0.16668519377708435, + "step": 23690 + }, + { + "ce_loss": 0.010949512012302876, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "distill_loss": 0.10162533074617386, + "epoch": 7.901934623082055, + "step": 23690 + }, + { + "epoch": 7.901934623082055, + "ref_ce_loss": 0.03732268139719963, + "step": 23690 + }, + { + "epoch": 7.90527018012008, + "loss": 0.3171, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "grad_norm": 2.6990199089050293, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "learning_rate": 1.1030004207793763e-07, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "loss": 0.18577899038791656, + "step": 23700 + }, + { + "ce_loss": 0.03319680318236351, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "distill_loss": 0.09356150031089783, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "ref_ce_loss": 0.03948800638318062, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "loss": 0.2524036467075348, + "step": 23700 + }, + { + "ce_loss": 0.024047965183854103, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "distill_loss": 0.12219231575727463, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "ref_ce_loss": 0.07101521641016006, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "loss": 0.1832265704870224, + "step": 23700 + }, + { + "ce_loss": 0.012943029403686523, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "distill_loss": 0.11475443094968796, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "ref_ce_loss": 0.043082673102617264, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "loss": 0.36426877975463867, + "step": 23700 + }, + { + "ce_loss": 0.0561620257794857, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "distill_loss": 0.1449742615222931, + "epoch": 7.90527018012008, + "step": 23700 + }, + { + "epoch": 7.90527018012008, + "ref_ce_loss": 0.07117841392755508, + "step": 23700 + }, + { + "epoch": 7.908605737158106, + "loss": 0.3003, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "grad_norm": 3.6066665649414062, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "learning_rate": 1.0267005778847315e-07, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "loss": 0.2224166840314865, + "step": 23710 + }, + { + "ce_loss": 0.04008515179157257, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "distill_loss": 0.1341482698917389, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "ref_ce_loss": 0.04800305888056755, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "loss": 0.337680459022522, + "step": 23710 + }, + { + "ce_loss": 0.025626100599765778, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "distill_loss": 0.19002515077590942, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "ref_ce_loss": 0.07956057786941528, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "loss": 0.33845025300979614, + "step": 23710 + }, + { + "ce_loss": 0.034407805651426315, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "distill_loss": 0.24543067812919617, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "ref_ce_loss": 0.058323897421360016, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "loss": 0.293621301651001, + "step": 23710 + }, + { + "ce_loss": 0.008468334563076496, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "distill_loss": 0.2118399441242218, + "epoch": 7.908605737158106, + "step": 23710 + }, + { + "epoch": 7.908605737158106, + "ref_ce_loss": 0.03856439143419266, + "step": 23710 + }, + { + "epoch": 7.911941294196131, + "loss": 0.3429, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "grad_norm": 4.594830513000488, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "learning_rate": 9.53134270991307e-08, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "loss": 0.2774868309497833, + "step": 23720 + }, + { + "ce_loss": 0.07063006609678268, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "distill_loss": 0.12501201033592224, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "ref_ce_loss": 0.0638425201177597, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "loss": 0.23774346709251404, + "step": 23720 + }, + { + "ce_loss": 0.008322431705892086, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "distill_loss": 0.1158476173877716, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "ref_ce_loss": 0.04607711732387543, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "loss": 0.5005284547805786, + "step": 23720 + }, + { + "ce_loss": 0.10049159824848175, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "distill_loss": 0.25245940685272217, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "ref_ce_loss": 0.034680433571338654, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "loss": 0.3965238332748413, + "step": 23720 + }, + { + "ce_loss": 0.08287633210420609, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "distill_loss": 0.21766790747642517, + "epoch": 7.911941294196131, + "step": 23720 + }, + { + "epoch": 7.911941294196131, + "ref_ce_loss": 0.047307275235652924, + "step": 23720 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.285, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "grad_norm": 3.145902156829834, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "learning_rate": 8.823016342554557e-08, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.2347753643989563, + "step": 23730 + }, + { + "ce_loss": 0.010778088122606277, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "distill_loss": 0.14141809940338135, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "ref_ce_loss": 0.032311439514160156, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.3499191105365753, + "step": 23730 + }, + { + "ce_loss": 0.10165178775787354, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "distill_loss": 0.1755722612142563, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "ref_ce_loss": 0.07252631336450577, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.4272885322570801, + "step": 23730 + }, + { + "ce_loss": 0.014664656482636929, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "distill_loss": 0.29934969544410706, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "ref_ce_loss": 0.06960371136665344, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "loss": 0.14407768845558167, + "step": 23730 + }, + { + "ce_loss": 0.009971325285732746, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "distill_loss": 0.11345706135034561, + "epoch": 7.9152768512341565, + "step": 23730 + }, + { + "epoch": 7.9152768512341565, + "ref_ce_loss": 0.0139697827398777, + "step": 23730 + }, + { + "epoch": 7.918612408272182, + "loss": 0.3189, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "grad_norm": 3.545828104019165, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "learning_rate": 8.14202796847685e-08, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "loss": 0.17268899083137512, + "step": 23740 + }, + { + "ce_loss": 0.02548079378902912, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "distill_loss": 0.10526704788208008, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "ref_ce_loss": 0.0418148934841156, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "loss": 0.19164453446865082, + "step": 23740 + }, + { + "ce_loss": 0.004120680969208479, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "distill_loss": 0.1119823306798935, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "ref_ce_loss": 0.0482141338288784, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "loss": 0.39481455087661743, + "step": 23740 + }, + { + "ce_loss": 0.0830196812748909, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "distill_loss": 0.24142134189605713, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "ref_ce_loss": 0.030993618071079254, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "loss": 0.31761011481285095, + "step": 23740 + }, + { + "ce_loss": 0.04722367972135544, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "distill_loss": 0.17737168073654175, + "epoch": 7.918612408272182, + "step": 23740 + }, + { + "epoch": 7.918612408272182, + "ref_ce_loss": 0.07376193255186081, + "step": 23740 + }, + { + "epoch": 7.921947965310207, + "loss": 0.2852, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "grad_norm": 2.1956324577331543, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "learning_rate": 7.488378829534903e-08, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "loss": 0.45576271414756775, + "step": 23750 + }, + { + "ce_loss": 0.05745376646518707, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "distill_loss": 0.26547667384147644, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "ref_ce_loss": 0.06741317361593246, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "loss": 0.3387796878814697, + "step": 23750 + }, + { + "ce_loss": 0.03637593984603882, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "distill_loss": 0.19918107986450195, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "ref_ce_loss": 0.05107717961072922, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "loss": 0.2869316637516022, + "step": 23750 + }, + { + "ce_loss": 0.015967372804880142, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "distill_loss": 0.12445250153541565, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "ref_ce_loss": 0.04145001992583275, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "loss": 0.31970643997192383, + "step": 23750 + }, + { + "ce_loss": 0.03916800767183304, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "distill_loss": 0.14288796484470367, + "epoch": 7.921947965310207, + "step": 23750 + }, + { + "epoch": 7.921947965310207, + "ref_ce_loss": 0.051019471138715744, + "step": 23750 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.3092, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "grad_norm": 4.615090370178223, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "learning_rate": 6.862070117725216e-08, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.32668691873550415, + "step": 23760 + }, + { + "ce_loss": 0.03978271782398224, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "distill_loss": 0.1851176917552948, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "ref_ce_loss": 0.050581347197294235, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.2914896607398987, + "step": 23760 + }, + { + "ce_loss": 0.0444149523973465, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "distill_loss": 0.14117684960365295, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "ref_ce_loss": 0.037499092519283295, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.3269153833389282, + "step": 23760 + }, + { + "ce_loss": 0.004225606564432383, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "distill_loss": 0.20178727805614471, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "ref_ce_loss": 0.06147385388612747, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "loss": 0.342536985874176, + "step": 23760 + }, + { + "ce_loss": 0.013994453474879265, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "distill_loss": 0.25507158041000366, + "epoch": 7.9252835223482325, + "step": 23760 + }, + { + "epoch": 7.9252835223482325, + "ref_ce_loss": 0.05506007373332977, + "step": 23760 + }, + { + "epoch": 7.928619079386258, + "loss": 0.3146, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "grad_norm": 3.3470752239227295, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "learning_rate": 6.263102975190837e-08, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "loss": 0.395652711391449, + "step": 23770 + }, + { + "ce_loss": 0.08911252021789551, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "distill_loss": 0.21665403246879578, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "ref_ce_loss": 0.07107770442962646, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "loss": 0.3040449619293213, + "step": 23770 + }, + { + "ce_loss": 0.015351009555161, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "distill_loss": 0.2228090912103653, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "ref_ce_loss": 0.04557941108942032, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "loss": 0.23056939244270325, + "step": 23770 + }, + { + "ce_loss": 0.023988118395209312, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "distill_loss": 0.13429005444049835, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "ref_ce_loss": 0.07207389920949936, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "loss": 0.28277847170829773, + "step": 23770 + }, + { + "ce_loss": 0.046883635222911835, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "distill_loss": 0.12595364451408386, + "epoch": 7.928619079386258, + "step": 23770 + }, + { + "epoch": 7.928619079386258, + "ref_ce_loss": 0.04364423453807831, + "step": 23770 + }, + { + "epoch": 7.931954636424283, + "loss": 0.3274, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "grad_norm": 3.726837158203125, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "learning_rate": 5.6914784942097004e-08, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "loss": 0.2981261909008026, + "step": 23780 + }, + { + "ce_loss": 0.010031945072114468, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "distill_loss": 0.22223161160945892, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "ref_ce_loss": 0.040669411420822144, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "loss": 0.2453022301197052, + "step": 23780 + }, + { + "ce_loss": 0.034803520888090134, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "distill_loss": 0.16121885180473328, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "ref_ce_loss": 0.04921687766909599, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "loss": 0.3023455739021301, + "step": 23780 + }, + { + "ce_loss": 0.05195079371333122, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "distill_loss": 0.15900222957134247, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "ref_ce_loss": 0.04697020724415779, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "loss": 0.20833349227905273, + "step": 23780 + }, + { + "ce_loss": 0.02568138763308525, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "distill_loss": 0.11793918907642365, + "epoch": 7.931954636424283, + "step": 23780 + }, + { + "epoch": 7.931954636424283, + "ref_ce_loss": 0.042938802391290665, + "step": 23780 + }, + { + "epoch": 7.935290193462309, + "loss": 0.3196, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "grad_norm": 3.1810896396636963, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "learning_rate": 5.14719771720129e-08, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "loss": 0.4055737555027008, + "step": 23790 + }, + { + "ce_loss": 0.06018650531768799, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "distill_loss": 0.22380036115646362, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "ref_ce_loss": 0.098446786403656, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "loss": 0.24172917008399963, + "step": 23790 + }, + { + "ce_loss": 0.029822878539562225, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "distill_loss": 0.15046468377113342, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "ref_ce_loss": 0.04222504422068596, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "loss": 0.19609327614307404, + "step": 23790 + }, + { + "ce_loss": 0.0036498629488050938, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "distill_loss": 0.11934056133031845, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "ref_ce_loss": 0.04515659064054489, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "loss": 0.6010411977767944, + "step": 23790 + }, + { + "ce_loss": 0.04046886786818504, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "distill_loss": 0.225779190659523, + "epoch": 7.935290193462309, + "step": 23790 + }, + { + "epoch": 7.935290193462309, + "ref_ce_loss": 0.0491827093064785, + "step": 23790 + }, + { + "epoch": 7.938625750500334, + "loss": 0.3412, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "grad_norm": 2.477982521057129, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "learning_rate": 4.6302616367149824e-08, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "loss": 0.5360799431800842, + "step": 23800 + }, + { + "ce_loss": 0.028202766552567482, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "distill_loss": 0.30352216958999634, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "ref_ce_loss": 0.04931466281414032, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "loss": 0.24504601955413818, + "step": 23800 + }, + { + "ce_loss": 0.053709227591753006, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "distill_loss": 0.1536063253879547, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "ref_ce_loss": 0.03759874030947685, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "loss": 0.17871206998825073, + "step": 23800 + }, + { + "ce_loss": 0.01082976907491684, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "distill_loss": 0.09340672194957733, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "ref_ce_loss": 0.018215442076325417, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "loss": 0.41664665937423706, + "step": 23800 + }, + { + "ce_loss": 0.007084306795150042, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "distill_loss": 0.18884873390197754, + "epoch": 7.938625750500334, + "step": 23800 + }, + { + "epoch": 7.938625750500334, + "ref_ce_loss": 0.034425389021635056, + "step": 23800 + }, + { + "epoch": 7.941961307538359, + "loss": 0.3242, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "grad_norm": 2.6601004600524902, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "learning_rate": 4.140671195443368e-08, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "loss": 0.17185519635677338, + "step": 23810 + }, + { + "ce_loss": 0.004476575180888176, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "distill_loss": 0.08696437627077103, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "ref_ce_loss": 0.03971419855952263, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "loss": 0.4425716996192932, + "step": 23810 + }, + { + "ce_loss": 0.012841652147471905, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "distill_loss": 0.21129585802555084, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "ref_ce_loss": 0.04600348696112633, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "loss": 0.3691619038581848, + "step": 23810 + }, + { + "ce_loss": 0.08319517225027084, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "distill_loss": 0.1942371129989624, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "ref_ce_loss": 0.06677474826574326, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "loss": 0.3194017708301544, + "step": 23810 + }, + { + "ce_loss": 0.04230117425322533, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "distill_loss": 0.14899131655693054, + "epoch": 7.941961307538359, + "step": 23810 + }, + { + "epoch": 7.941961307538359, + "ref_ce_loss": 0.05892167240381241, + "step": 23810 + }, + { + "epoch": 7.945296864576385, + "loss": 0.3391, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "grad_norm": 3.832427978515625, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "learning_rate": 3.678427286202268e-08, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "loss": 0.4109659790992737, + "step": 23820 + }, + { + "ce_loss": 0.0191717017441988, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "distill_loss": 0.14880934357643127, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "ref_ce_loss": 0.021254807710647583, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "loss": 0.33141595125198364, + "step": 23820 + }, + { + "ce_loss": 0.04109518229961395, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "distill_loss": 0.24267660081386566, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "ref_ce_loss": 0.04730328917503357, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "loss": 0.18887729942798615, + "step": 23820 + }, + { + "ce_loss": 0.017927071079611778, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "distill_loss": 0.12233343720436096, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "ref_ce_loss": 0.04846320301294327, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "loss": 0.37258389592170715, + "step": 23820 + }, + { + "ce_loss": 0.011785218492150307, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "distill_loss": 0.12698237597942352, + "epoch": 7.945296864576385, + "step": 23820 + }, + { + "epoch": 7.945296864576385, + "ref_ce_loss": 0.07257344573736191, + "step": 23820 + }, + { + "epoch": 7.94863242161441, + "loss": 0.3368, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "grad_norm": 2.7429282665252686, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "learning_rate": 3.243530751944057e-08, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "loss": 0.22189457714557648, + "step": 23830 + }, + { + "ce_loss": 0.027077307924628258, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "distill_loss": 0.16582459211349487, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "ref_ce_loss": 0.028566185384988785, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "loss": 0.363383412361145, + "step": 23830 + }, + { + "ce_loss": 0.03507964313030243, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "distill_loss": 0.15669700503349304, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "ref_ce_loss": 0.06525471806526184, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "loss": 0.2155691385269165, + "step": 23830 + }, + { + "ce_loss": 0.01610867865383625, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "distill_loss": 0.14725200831890106, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "ref_ce_loss": 0.024043144658207893, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "loss": 0.21124151349067688, + "step": 23830 + }, + { + "ce_loss": 0.018030596897006035, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "distill_loss": 0.12607772648334503, + "epoch": 7.94863242161441, + "step": 23830 + }, + { + "epoch": 7.94863242161441, + "ref_ce_loss": 0.033300936222076416, + "step": 23830 + }, + { + "epoch": 7.951967978652435, + "loss": 0.2854, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "grad_norm": 2.5058624744415283, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "learning_rate": 2.8359823857476705e-08, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "loss": 0.29359400272369385, + "step": 23840 + }, + { + "ce_loss": 0.09036729484796524, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "distill_loss": 0.15034735202789307, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "ref_ce_loss": 0.03662831336259842, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "loss": 0.36976414918899536, + "step": 23840 + }, + { + "ce_loss": 0.060340650379657745, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "distill_loss": 0.1331426501274109, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "ref_ce_loss": 0.04315692558884621, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "loss": 0.2175612449645996, + "step": 23840 + }, + { + "ce_loss": 0.004586022812873125, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "distill_loss": 0.13164444267749786, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "ref_ce_loss": 0.05861181393265724, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "loss": 0.2547665238380432, + "step": 23840 + }, + { + "ce_loss": 0.036511220037937164, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "distill_loss": 0.10626421123743057, + "epoch": 7.951967978652435, + "step": 23840 + }, + { + "epoch": 7.951967978652435, + "ref_ce_loss": 0.04119854047894478, + "step": 23840 + }, + { + "epoch": 7.955303535690461, + "loss": 0.3294, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "grad_norm": 3.057788372039795, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "learning_rate": 2.4557829308202714e-08, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "loss": 0.2910195291042328, + "step": 23850 + }, + { + "ce_loss": 0.011250575073063374, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "distill_loss": 0.21544837951660156, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "ref_ce_loss": 0.043316297233104706, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "loss": 0.3057239055633545, + "step": 23850 + }, + { + "ce_loss": 0.02447928860783577, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "distill_loss": 0.22899970412254333, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "ref_ce_loss": 0.051885154098272324, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "loss": 0.24294811487197876, + "step": 23850 + }, + { + "ce_loss": 0.011331611312925816, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "distill_loss": 0.16551092267036438, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "ref_ce_loss": 0.042633138597011566, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "loss": 0.6378141641616821, + "step": 23850 + }, + { + "ce_loss": 0.02813078835606575, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "distill_loss": 0.14803311228752136, + "epoch": 7.955303535690461, + "step": 23850 + }, + { + "epoch": 7.955303535690461, + "ref_ce_loss": 0.05598514899611473, + "step": 23850 + }, + { + "epoch": 7.958639092728486, + "loss": 0.3353, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "grad_norm": 3.8991682529449463, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "learning_rate": 2.102933080497249e-08, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "loss": 0.1944175809621811, + "step": 23860 + }, + { + "ce_loss": 0.012968351133167744, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "distill_loss": 0.1464006006717682, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "ref_ce_loss": 0.034779004752635956, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "loss": 0.34716102480888367, + "step": 23860 + }, + { + "ce_loss": 0.0167516078799963, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "distill_loss": 0.22644741833209991, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "ref_ce_loss": 0.05799233913421631, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "loss": 0.24939963221549988, + "step": 23860 + }, + { + "ce_loss": 0.021768810227513313, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "distill_loss": 0.15550008416175842, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "ref_ce_loss": 0.049455419182777405, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "loss": 0.46806520223617554, + "step": 23860 + }, + { + "ce_loss": 0.029784483835101128, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "distill_loss": 0.20984099805355072, + "epoch": 7.958639092728486, + "step": 23860 + }, + { + "epoch": 7.958639092728486, + "ref_ce_loss": 0.057520121335983276, + "step": 23860 + }, + { + "epoch": 7.961974649766511, + "loss": 0.3539, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "grad_norm": 3.716891288757324, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "learning_rate": 1.7774334782372224e-08, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "loss": 0.3307017385959625, + "step": 23870 + }, + { + "ce_loss": 0.002352922922000289, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "distill_loss": 0.16378623247146606, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "ref_ce_loss": 0.07698747515678406, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "loss": 0.348903626203537, + "step": 23870 + }, + { + "ce_loss": 0.026433821767568588, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "distill_loss": 0.2482292652130127, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "ref_ce_loss": 0.05007699504494667, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "loss": 0.2856520712375641, + "step": 23870 + }, + { + "ce_loss": 0.06673577427864075, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "distill_loss": 0.1341075897216797, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "ref_ce_loss": 0.04442819580435753, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "loss": 0.3865754008293152, + "step": 23870 + }, + { + "ce_loss": 0.009188508614897728, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "distill_loss": 0.19062168896198273, + "epoch": 7.961974649766511, + "step": 23870 + }, + { + "epoch": 7.961974649766511, + "ref_ce_loss": 0.05311084911227226, + "step": 23870 + }, + { + "epoch": 7.965310206804537, + "loss": 0.308, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "grad_norm": 2.8505074977874756, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "learning_rate": 1.4792847176220423e-08, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "loss": 0.5729832053184509, + "step": 23880 + }, + { + "ce_loss": 0.03645119443535805, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "distill_loss": 0.12541188299655914, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "ref_ce_loss": 0.04846469312906265, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "loss": 0.47117969393730164, + "step": 23880 + }, + { + "ce_loss": 0.039367545396089554, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "distill_loss": 0.2976151704788208, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "ref_ce_loss": 0.0795765370130539, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "loss": 0.3643397390842438, + "step": 23880 + }, + { + "ce_loss": 0.031012022867798805, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "distill_loss": 0.22613711655139923, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "ref_ce_loss": 0.050101302564144135, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "loss": 0.26308199763298035, + "step": 23880 + }, + { + "ce_loss": 0.015335160307586193, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "distill_loss": 0.13855913281440735, + "epoch": 7.965310206804537, + "step": 23880 + }, + { + "epoch": 7.965310206804537, + "ref_ce_loss": 0.06943170726299286, + "step": 23880 + }, + { + "epoch": 7.968645763842562, + "loss": 0.3256, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "grad_norm": 4.51202392578125, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "learning_rate": 1.2084873423584552e-08, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "loss": 0.33923307061195374, + "step": 23890 + }, + { + "ce_loss": 0.018250662833452225, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "distill_loss": 0.25644806027412415, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "ref_ce_loss": 0.04556307941675186, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "loss": 0.4055192470550537, + "step": 23890 + }, + { + "ce_loss": 0.05485742539167404, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "distill_loss": 0.2536390423774719, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "ref_ce_loss": 0.0628480315208435, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "loss": 0.3442380726337433, + "step": 23890 + }, + { + "ce_loss": 0.05271763727068901, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "distill_loss": 0.15627413988113403, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "ref_ce_loss": 0.11837184429168701, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "loss": 0.2478257417678833, + "step": 23890 + }, + { + "ce_loss": 0.0008815837791189551, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "distill_loss": 0.17156405746936798, + "epoch": 7.968645763842562, + "step": 23890 + }, + { + "epoch": 7.968645763842562, + "ref_ce_loss": 0.03145066648721695, + "step": 23890 + }, + { + "epoch": 7.971981320880587, + "loss": 0.3203, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "grad_norm": 2.825669050216675, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "learning_rate": 9.65041846273107e-09, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "loss": 0.3642113208770752, + "step": 23900 + }, + { + "ce_loss": 0.02877146378159523, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "distill_loss": 0.18471139669418335, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "ref_ce_loss": 0.07360993325710297, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "loss": 0.30159834027290344, + "step": 23900 + }, + { + "ce_loss": 0.008763355202972889, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "distill_loss": 0.20540040731430054, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "ref_ce_loss": 0.05317322164773941, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "loss": 0.36580002307891846, + "step": 23900 + }, + { + "ce_loss": 0.01582839898765087, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "distill_loss": 0.28203338384628296, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "ref_ce_loss": 0.04814530909061432, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "loss": 0.26183679699897766, + "step": 23900 + }, + { + "ce_loss": 0.021706560626626015, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "distill_loss": 0.1578688770532608, + "epoch": 7.971981320880587, + "step": 23900 + }, + { + "epoch": 7.971981320880587, + "ref_ce_loss": 0.04011444374918938, + "step": 23900 + }, + { + "epoch": 7.975316877918613, + "loss": 0.3224, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "grad_norm": 2.1414968967437744, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "learning_rate": 7.489486733142091e-09, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "loss": 0.3207355737686157, + "step": 23910 + }, + { + "ce_loss": 0.006373913958668709, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "distill_loss": 0.24951599538326263, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "ref_ce_loss": 0.06476053595542908, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "loss": 0.20579034090042114, + "step": 23910 + }, + { + "ce_loss": 0.004123230930417776, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "distill_loss": 0.10635682940483093, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "ref_ce_loss": 0.04275263100862503, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "loss": 0.32259857654571533, + "step": 23910 + }, + { + "ce_loss": 0.019134141504764557, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "distill_loss": 0.1968674212694168, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "ref_ce_loss": 0.03675852715969086, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "loss": 0.30018824338912964, + "step": 23910 + }, + { + "ce_loss": 0.015749013051390648, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "distill_loss": 0.1425822377204895, + "epoch": 7.975316877918613, + "step": 23910 + }, + { + "epoch": 7.975316877918613, + "ref_ce_loss": 0.06256047636270523, + "step": 23910 + }, + { + "epoch": 7.978652434956638, + "loss": 0.3346, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "grad_norm": 3.1309776306152344, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "learning_rate": 5.602082175515388e-09, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "loss": 0.27254971861839294, + "step": 23920 + }, + { + "ce_loss": 0.02068653143942356, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "distill_loss": 0.13779519498348236, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "ref_ce_loss": 0.052534569054841995, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "loss": 0.19287121295928955, + "step": 23920 + }, + { + "ce_loss": 0.008895105682313442, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "distill_loss": 0.11923294514417648, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "ref_ce_loss": 0.035864803940057755, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "loss": 0.17826569080352783, + "step": 23920 + }, + { + "ce_loss": 0.018720723688602448, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "distill_loss": 0.10931409150362015, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "ref_ce_loss": 0.033927470445632935, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "loss": 0.2010970562696457, + "step": 23920 + }, + { + "ce_loss": 0.03899431601166725, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "distill_loss": 0.11069336533546448, + "epoch": 7.978652434956638, + "step": 23920 + }, + { + "epoch": 7.978652434956638, + "ref_ce_loss": 0.026145868003368378, + "step": 23920 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.3318, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "grad_norm": 3.460667371749878, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "learning_rate": 3.988208231747725e-09, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.3221658170223236, + "step": 23930 + }, + { + "ce_loss": 0.006332428194582462, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "distill_loss": 0.1634363979101181, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "ref_ce_loss": 0.07524649053812027, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.20411431789398193, + "step": 23930 + }, + { + "ce_loss": 0.02330736815929413, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "distill_loss": 0.09716067463159561, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "ref_ce_loss": 0.051996905356645584, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.3010110557079315, + "step": 23930 + }, + { + "ce_loss": 0.023202329874038696, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "distill_loss": 0.23072974383831024, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "ref_ce_loss": 0.04667457938194275, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "loss": 0.28928112983703613, + "step": 23930 + }, + { + "ce_loss": 0.017655836418271065, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "distill_loss": 0.12512949109077454, + "epoch": 7.9819879919946635, + "step": 23930 + }, + { + "epoch": 7.9819879919946635, + "ref_ce_loss": 0.03689692169427872, + "step": 23930 + }, + { + "epoch": 7.985323549032689, + "loss": 0.3457, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "grad_norm": 2.695939064025879, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "learning_rate": 2.6478678448682567e-09, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "loss": 0.2173708826303482, + "step": 23940 + }, + { + "ce_loss": 0.04703154414892197, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "distill_loss": 0.11919710040092468, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "ref_ce_loss": 0.03168262913823128, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "loss": 0.21798212826251984, + "step": 23940 + }, + { + "ce_loss": 0.007845278829336166, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "distill_loss": 0.12254693359136581, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "ref_ce_loss": 0.05747520551085472, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "loss": 0.3193615972995758, + "step": 23940 + }, + { + "ce_loss": 0.015500759705901146, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "distill_loss": 0.11406480520963669, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "ref_ce_loss": 0.03308076784014702, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "loss": 0.19805341958999634, + "step": 23940 + }, + { + "ce_loss": 0.015502206049859524, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "distill_loss": 0.15302814543247223, + "epoch": 7.985323549032689, + "step": 23940 + }, + { + "epoch": 7.985323549032689, + "ref_ce_loss": 0.02931785024702549, + "step": 23940 + }, + { + "epoch": 7.988659106070714, + "loss": 0.2937, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "grad_norm": 5.747377395629883, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "learning_rate": 1.5810634591550964e-09, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "loss": 0.4828079342842102, + "step": 23950 + }, + { + "ce_loss": 0.07146650552749634, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "distill_loss": 0.3095947504043579, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "ref_ce_loss": 0.04873261973261833, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "loss": 0.4262124300003052, + "step": 23950 + }, + { + "ce_loss": 0.027315624058246613, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "distill_loss": 0.16963714361190796, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "ref_ce_loss": 0.060540758073329926, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "loss": 0.42749008536338806, + "step": 23950 + }, + { + "ce_loss": 0.051516093313694, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "distill_loss": 0.23047225177288055, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "ref_ce_loss": 0.08982308208942413, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "loss": 0.43480467796325684, + "step": 23950 + }, + { + "ce_loss": 0.03439018130302429, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "distill_loss": 0.16861560940742493, + "epoch": 7.988659106070714, + "step": 23950 + }, + { + "epoch": 7.988659106070714, + "ref_ce_loss": 0.04760503023862839, + "step": 23950 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.3183, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "grad_norm": 2.7786331176757812, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "learning_rate": 7.877970200353967e-10, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.2137977033853531, + "step": 23960 + }, + { + "ce_loss": 0.02110884338617325, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "distill_loss": 0.12767818570137024, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "ref_ce_loss": 0.034436240792274475, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.26209551095962524, + "step": 23960 + }, + { + "ce_loss": 0.0033823607955127954, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "distill_loss": 0.211116760969162, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "ref_ce_loss": 0.04747690632939339, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.4075537323951721, + "step": 23960 + }, + { + "ce_loss": 0.013052606023848057, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "distill_loss": 0.17281053960323334, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "ref_ce_loss": 0.05032519996166229, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "loss": 0.29942646622657776, + "step": 23960 + }, + { + "ce_loss": 0.051360610872507095, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "distill_loss": 0.1980040967464447, + "epoch": 7.9919946631087395, + "step": 23960 + }, + { + "epoch": 7.9919946631087395, + "ref_ce_loss": 0.049913190305233, + "step": 23960 + }, + { + "epoch": 7.995330220146765, + "loss": 0.2936, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "grad_norm": 2.5634679794311523, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "learning_rate": 2.680699741186565e-10, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "loss": 0.2984486222267151, + "step": 23970 + }, + { + "ce_loss": 0.031040744855999947, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "distill_loss": 0.16967520117759705, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "ref_ce_loss": 0.07135186344385147, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "loss": 0.28316909074783325, + "step": 23970 + }, + { + "ce_loss": 0.014297720044851303, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "distill_loss": 0.18668489158153534, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "ref_ce_loss": 0.05426119267940521, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "loss": 0.3653208911418915, + "step": 23970 + }, + { + "ce_loss": 0.11517569422721863, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "distill_loss": 0.19170519709587097, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "ref_ce_loss": 0.04996060952544212, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "loss": 0.28267958760261536, + "step": 23970 + }, + { + "ce_loss": 0.08331085741519928, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "distill_loss": 0.13024798035621643, + "epoch": 7.995330220146765, + "step": 23970 + }, + { + "epoch": 7.995330220146765, + "ref_ce_loss": 0.06889346987009048, + "step": 23970 + }, + { + "epoch": 7.99866577718479, + "loss": 0.3085, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "grad_norm": 2.1330296993255615, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "learning_rate": 2.188326918006744e-11, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "loss": 0.9639419317245483, + "step": 23980 + }, + { + "ce_loss": 0.06816727668046951, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "distill_loss": 0.14574238657951355, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "ref_ce_loss": 0.06669634580612183, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "loss": 0.3473624587059021, + "step": 23980 + }, + { + "ce_loss": 0.04842541366815567, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "distill_loss": 0.23083451390266418, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "ref_ce_loss": 0.051015496253967285, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "loss": 0.21875524520874023, + "step": 23980 + }, + { + "ce_loss": 0.01979643665254116, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "distill_loss": 0.11872906237840652, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "ref_ce_loss": 0.03821733966469765, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "loss": 0.20249786972999573, + "step": 23980 + }, + { + "ce_loss": 0.03200726583600044, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "distill_loss": 0.13593564927577972, + "epoch": 7.99866577718479, + "step": 23980 + }, + { + "epoch": 7.99866577718479, + "ref_ce_loss": 0.034453000873327255, + "step": 23980 + }, + { + "epoch": 8.0, + "step": 23984, + "train_runtime": 145201.4613 + }, + { + "epoch": 8.0, + "step": 23984, + "train_samples_per_second": 21.142 + }, + { + "epoch": 8.0, + "step": 23984, + "train_steps_per_second": 0.165 + }, + { + "epoch": 8.0, + "step": 23984, + "total_flos": 0.0 + }, + { + "epoch": 8.0, + "step": 23984, + "train_loss": 0.537310748268478 + } + ], + "logging_steps": 10, + "max_steps": 23984, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}