diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17340 @@ +{ + "best_metric": 0.48, + "best_model_checkpoint": "5c_4/checkpoint-9360", + "epoch": 99.01, + "eval_steps": 500, + "global_step": 23400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00042735042735042735, + "grad_norm": 6.5477800369262695, + "learning_rate": 4.273504273504274e-08, + "loss": 1.341, + "step": 10 + }, + { + "epoch": 0.0008547008547008547, + "grad_norm": 7.9439568519592285, + "learning_rate": 8.547008547008549e-08, + "loss": 1.3047, + "step": 20 + }, + { + "epoch": 0.001282051282051282, + "grad_norm": 6.425937175750732, + "learning_rate": 1.282051282051282e-07, + "loss": 1.3641, + "step": 30 + }, + { + "epoch": 0.0017094017094017094, + "grad_norm": 6.959607124328613, + "learning_rate": 1.7094017094017097e-07, + "loss": 1.3257, + "step": 40 + }, + { + "epoch": 0.002136752136752137, + "grad_norm": 6.3154730796813965, + "learning_rate": 2.136752136752137e-07, + "loss": 1.3008, + "step": 50 + }, + { + "epoch": 0.002564102564102564, + "grad_norm": 6.919759750366211, + "learning_rate": 2.564102564102564e-07, + "loss": 1.3322, + "step": 60 + }, + { + "epoch": 0.0029914529914529917, + "grad_norm": 6.870538711547852, + "learning_rate": 2.991452991452992e-07, + "loss": 1.313, + "step": 70 + }, + { + "epoch": 0.003418803418803419, + "grad_norm": 6.823713302612305, + "learning_rate": 3.4188034188034194e-07, + "loss": 1.3179, + "step": 80 + }, + { + "epoch": 0.0038461538461538464, + "grad_norm": 7.863243579864502, + "learning_rate": 3.846153846153847e-07, + "loss": 1.2466, + "step": 90 + }, + { + "epoch": 0.004273504273504274, + "grad_norm": 7.5396881103515625, + "learning_rate": 4.273504273504274e-07, + "loss": 1.2098, + "step": 100 + }, + { + "epoch": 0.004700854700854701, + "grad_norm": 8.087361335754395, + "learning_rate": 4.700854700854701e-07, + "loss": 1.2309, + "step": 110 + }, + { + "epoch": 0.005128205128205128, + "grad_norm": 8.429758071899414, + "learning_rate": 5.128205128205128e-07, + "loss": 1.1752, + "step": 120 + }, + { + "epoch": 0.005555555555555556, + "grad_norm": 9.436071395874023, + "learning_rate": 5.555555555555555e-07, + "loss": 1.1474, + "step": 130 + }, + { + "epoch": 0.005982905982905983, + "grad_norm": 80.47299194335938, + "learning_rate": 5.982905982905984e-07, + "loss": 1.1764, + "step": 140 + }, + { + "epoch": 0.00641025641025641, + "grad_norm": 16.705190658569336, + "learning_rate": 6.41025641025641e-07, + "loss": 1.1038, + "step": 150 + }, + { + "epoch": 0.006837606837606838, + "grad_norm": 22.808284759521484, + "learning_rate": 6.837606837606839e-07, + "loss": 1.1758, + "step": 160 + }, + { + "epoch": 0.007264957264957265, + "grad_norm": 23.328384399414062, + "learning_rate": 7.264957264957266e-07, + "loss": 1.2182, + "step": 170 + }, + { + "epoch": 0.007692307692307693, + "grad_norm": 16.98978614807129, + "learning_rate": 7.692307692307694e-07, + "loss": 0.8752, + "step": 180 + }, + { + "epoch": 0.00811965811965812, + "grad_norm": 197.35919189453125, + "learning_rate": 8.11965811965812e-07, + "loss": 1.122, + "step": 190 + }, + { + "epoch": 0.008547008547008548, + "grad_norm": 32.2603759765625, + "learning_rate": 8.547008547008548e-07, + "loss": 1.1263, + "step": 200 + }, + { + "epoch": 0.008974358974358974, + "grad_norm": 15.653111457824707, + "learning_rate": 8.974358974358975e-07, + "loss": 1.3598, + "step": 210 + }, + { + "epoch": 0.009401709401709401, + "grad_norm": 13.729981422424316, + "learning_rate": 9.401709401709402e-07, + "loss": 0.6189, + "step": 220 + }, + { + "epoch": 0.009829059829059829, + "grad_norm": 30.82917594909668, + "learning_rate": 9.829059829059829e-07, + "loss": 0.9641, + "step": 230 + }, + { + "epoch": 0.01, + "eval_accuracy": 0.4, + "eval_loss": 1.4837762117385864, + "eval_runtime": 34.3992, + "eval_samples_per_second": 0.727, + "eval_steps_per_second": 0.727, + "step": 234 + }, + { + "epoch": 1.0002564102564102, + "grad_norm": 13.610635757446289, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.9614, + "step": 240 + }, + { + "epoch": 1.0006837606837606, + "grad_norm": 51.50576400756836, + "learning_rate": 1.0683760683760685e-06, + "loss": 0.9305, + "step": 250 + }, + { + "epoch": 1.001111111111111, + "grad_norm": 101.85962677001953, + "learning_rate": 1.111111111111111e-06, + "loss": 0.923, + "step": 260 + }, + { + "epoch": 1.0015384615384615, + "grad_norm": 15.406578063964844, + "learning_rate": 1.153846153846154e-06, + "loss": 1.558, + "step": 270 + }, + { + "epoch": 1.001965811965812, + "grad_norm": 49.64127731323242, + "learning_rate": 1.1965811965811968e-06, + "loss": 0.749, + "step": 280 + }, + { + "epoch": 1.0023931623931623, + "grad_norm": 23.893503189086914, + "learning_rate": 1.2393162393162394e-06, + "loss": 0.7217, + "step": 290 + }, + { + "epoch": 1.0028205128205128, + "grad_norm": 12.812580108642578, + "learning_rate": 1.282051282051282e-06, + "loss": 1.3277, + "step": 300 + }, + { + "epoch": 1.0032478632478632, + "grad_norm": 8.36343002319336, + "learning_rate": 1.324786324786325e-06, + "loss": 0.4727, + "step": 310 + }, + { + "epoch": 1.0036752136752136, + "grad_norm": 3.717958450317383, + "learning_rate": 1.3675213675213678e-06, + "loss": 1.0901, + "step": 320 + }, + { + "epoch": 1.004102564102564, + "grad_norm": 120.84101867675781, + "learning_rate": 1.4102564102564104e-06, + "loss": 1.2407, + "step": 330 + }, + { + "epoch": 1.0045299145299145, + "grad_norm": 76.76756286621094, + "learning_rate": 1.4529914529914531e-06, + "loss": 1.2918, + "step": 340 + }, + { + "epoch": 1.004957264957265, + "grad_norm": 66.22538757324219, + "learning_rate": 1.4957264957264957e-06, + "loss": 2.1196, + "step": 350 + }, + { + "epoch": 1.0053846153846153, + "grad_norm": 49.87553024291992, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.9441, + "step": 360 + }, + { + "epoch": 1.0058119658119657, + "grad_norm": 64.26871490478516, + "learning_rate": 1.5811965811965813e-06, + "loss": 1.3502, + "step": 370 + }, + { + "epoch": 1.0062393162393162, + "grad_norm": 1.687718391418457, + "learning_rate": 1.623931623931624e-06, + "loss": 2.3065, + "step": 380 + }, + { + "epoch": 1.0066666666666666, + "grad_norm": 1.3850126266479492, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.9136, + "step": 390 + }, + { + "epoch": 1.007094017094017, + "grad_norm": 1.278273344039917, + "learning_rate": 1.7094017094017097e-06, + "loss": 1.5894, + "step": 400 + }, + { + "epoch": 1.0075213675213675, + "grad_norm": 63.78499221801758, + "learning_rate": 1.7521367521367522e-06, + "loss": 1.1296, + "step": 410 + }, + { + "epoch": 1.0079487179487179, + "grad_norm": 0.9789712429046631, + "learning_rate": 1.794871794871795e-06, + "loss": 1.6205, + "step": 420 + }, + { + "epoch": 1.0083760683760683, + "grad_norm": 0.5999449491500854, + "learning_rate": 1.8376068376068378e-06, + "loss": 1.7319, + "step": 430 + }, + { + "epoch": 1.0088034188034187, + "grad_norm": 0.31746140122413635, + "learning_rate": 1.8803418803418804e-06, + "loss": 0.643, + "step": 440 + }, + { + "epoch": 1.0092307692307692, + "grad_norm": 76.34538269042969, + "learning_rate": 1.9230769230769234e-06, + "loss": 2.4496, + "step": 450 + }, + { + "epoch": 1.0096581196581196, + "grad_norm": 5.805349349975586, + "learning_rate": 1.9658119658119658e-06, + "loss": 1.5439, + "step": 460 + }, + { + "epoch": 1.01, + "eval_accuracy": 0.4, + "eval_loss": 3.7125139236450195, + "eval_runtime": 33.4952, + "eval_samples_per_second": 0.746, + "eval_steps_per_second": 0.746, + "step": 468 + }, + { + "epoch": 2.00008547008547, + "grad_norm": 72.71012878417969, + "learning_rate": 2.008547008547009e-06, + "loss": 1.7175, + "step": 470 + }, + { + "epoch": 2.0005128205128204, + "grad_norm": 0.6215125918388367, + "learning_rate": 2.0512820512820513e-06, + "loss": 2.393, + "step": 480 + }, + { + "epoch": 2.000940170940171, + "grad_norm": 1.5661790370941162, + "learning_rate": 2.094017094017094e-06, + "loss": 2.3866, + "step": 490 + }, + { + "epoch": 2.0013675213675213, + "grad_norm": 0.6268726587295532, + "learning_rate": 2.136752136752137e-06, + "loss": 2.4783, + "step": 500 + }, + { + "epoch": 2.0017948717948717, + "grad_norm": 95.59490203857422, + "learning_rate": 2.1794871794871797e-06, + "loss": 2.3849, + "step": 510 + }, + { + "epoch": 2.002222222222222, + "grad_norm": 0.4951792061328888, + "learning_rate": 2.222222222222222e-06, + "loss": 1.2576, + "step": 520 + }, + { + "epoch": 2.0026495726495726, + "grad_norm": 0.2503603398799896, + "learning_rate": 2.2649572649572653e-06, + "loss": 1.2306, + "step": 530 + }, + { + "epoch": 2.003076923076923, + "grad_norm": 52.469234466552734, + "learning_rate": 2.307692307692308e-06, + "loss": 1.8782, + "step": 540 + }, + { + "epoch": 2.0035042735042734, + "grad_norm": 0.20398114621639252, + "learning_rate": 2.3504273504273504e-06, + "loss": 0.5924, + "step": 550 + }, + { + "epoch": 2.003931623931624, + "grad_norm": 71.42668914794922, + "learning_rate": 2.3931623931623937e-06, + "loss": 3.2851, + "step": 560 + }, + { + "epoch": 2.0043589743589743, + "grad_norm": 56.71471405029297, + "learning_rate": 2.435897435897436e-06, + "loss": 1.2107, + "step": 570 + }, + { + "epoch": 2.0047863247863247, + "grad_norm": 0.25754937529563904, + "learning_rate": 2.478632478632479e-06, + "loss": 1.874, + "step": 580 + }, + { + "epoch": 2.005213675213675, + "grad_norm": 1.0091850757598877, + "learning_rate": 2.5213675213675216e-06, + "loss": 2.3834, + "step": 590 + }, + { + "epoch": 2.0056410256410255, + "grad_norm": 0.3749796450138092, + "learning_rate": 2.564102564102564e-06, + "loss": 1.1427, + "step": 600 + }, + { + "epoch": 2.006068376068376, + "grad_norm": 0.9544910788536072, + "learning_rate": 2.606837606837607e-06, + "loss": 2.4185, + "step": 610 + }, + { + "epoch": 2.0064957264957264, + "grad_norm": 0.21911416947841644, + "learning_rate": 2.64957264957265e-06, + "loss": 1.2006, + "step": 620 + }, + { + "epoch": 2.006923076923077, + "grad_norm": 0.1923692524433136, + "learning_rate": 2.6923076923076923e-06, + "loss": 1.2047, + "step": 630 + }, + { + "epoch": 2.0073504273504272, + "grad_norm": 0.2782430052757263, + "learning_rate": 2.7350427350427355e-06, + "loss": 1.7899, + "step": 640 + }, + { + "epoch": 2.0077777777777777, + "grad_norm": 0.17398680746555328, + "learning_rate": 2.7777777777777783e-06, + "loss": 1.0863, + "step": 650 + }, + { + "epoch": 2.008205128205128, + "grad_norm": 121.83155059814453, + "learning_rate": 2.8205128205128207e-06, + "loss": 1.805, + "step": 660 + }, + { + "epoch": 2.0086324786324785, + "grad_norm": 6.328676700592041, + "learning_rate": 2.8632478632478635e-06, + "loss": 1.6228, + "step": 670 + }, + { + "epoch": 2.009059829059829, + "grad_norm": 74.6235580444336, + "learning_rate": 2.9059829059829063e-06, + "loss": 3.3584, + "step": 680 + }, + { + "epoch": 2.0094871794871794, + "grad_norm": 0.6486341953277588, + "learning_rate": 2.948717948717949e-06, + "loss": 0.8365, + "step": 690 + }, + { + "epoch": 2.00991452991453, + "grad_norm": 0.20082318782806396, + "learning_rate": 2.9914529914529914e-06, + "loss": 1.2944, + "step": 700 + }, + { + "epoch": 2.01, + "eval_accuracy": 0.4, + "eval_loss": 3.6749236583709717, + "eval_runtime": 33.4285, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 702 + }, + { + "epoch": 3.0003418803418804, + "grad_norm": 49.130332946777344, + "learning_rate": 3.0341880341880342e-06, + "loss": 3.0762, + "step": 710 + }, + { + "epoch": 3.000769230769231, + "grad_norm": 10.840944290161133, + "learning_rate": 3.0769230769230774e-06, + "loss": 3.0125, + "step": 720 + }, + { + "epoch": 3.0011965811965813, + "grad_norm": 0.9621607661247253, + "learning_rate": 3.11965811965812e-06, + "loss": 1.0516, + "step": 730 + }, + { + "epoch": 3.0016239316239317, + "grad_norm": 0.4117166996002197, + "learning_rate": 3.1623931623931626e-06, + "loss": 0.5509, + "step": 740 + }, + { + "epoch": 3.002051282051282, + "grad_norm": 0.34464403986930847, + "learning_rate": 3.205128205128206e-06, + "loss": 0.5911, + "step": 750 + }, + { + "epoch": 3.0024786324786326, + "grad_norm": 0.21714632213115692, + "learning_rate": 3.247863247863248e-06, + "loss": 1.2975, + "step": 760 + }, + { + "epoch": 3.002905982905983, + "grad_norm": 0.6396281719207764, + "learning_rate": 3.290598290598291e-06, + "loss": 3.169, + "step": 770 + }, + { + "epoch": 3.0033333333333334, + "grad_norm": 0.28683122992515564, + "learning_rate": 3.3333333333333333e-06, + "loss": 1.8898, + "step": 780 + }, + { + "epoch": 3.003760683760684, + "grad_norm": 0.158400759100914, + "learning_rate": 3.3760683760683765e-06, + "loss": 0.6267, + "step": 790 + }, + { + "epoch": 3.0041880341880343, + "grad_norm": 49.82303237915039, + "learning_rate": 3.4188034188034193e-06, + "loss": 0.6848, + "step": 800 + }, + { + "epoch": 3.0046153846153847, + "grad_norm": 0.5617250800132751, + "learning_rate": 3.4615384615384617e-06, + "loss": 3.2734, + "step": 810 + }, + { + "epoch": 3.005042735042735, + "grad_norm": 104.6950912475586, + "learning_rate": 3.5042735042735045e-06, + "loss": 2.4173, + "step": 820 + }, + { + "epoch": 3.0054700854700855, + "grad_norm": 3.587768077850342, + "learning_rate": 3.5470085470085473e-06, + "loss": 0.7486, + "step": 830 + }, + { + "epoch": 3.005897435897436, + "grad_norm": 1.223972201347351, + "learning_rate": 3.58974358974359e-06, + "loss": 2.2701, + "step": 840 + }, + { + "epoch": 3.0063247863247864, + "grad_norm": 84.18639373779297, + "learning_rate": 3.632478632478633e-06, + "loss": 2.5556, + "step": 850 + }, + { + "epoch": 3.006752136752137, + "grad_norm": 0.33438971638679504, + "learning_rate": 3.6752136752136756e-06, + "loss": 1.1486, + "step": 860 + }, + { + "epoch": 3.0071794871794872, + "grad_norm": 0.4742439091205597, + "learning_rate": 3.7179487179487184e-06, + "loss": 1.8177, + "step": 870 + }, + { + "epoch": 3.0076068376068377, + "grad_norm": 0.20394758880138397, + "learning_rate": 3.760683760683761e-06, + "loss": 1.2203, + "step": 880 + }, + { + "epoch": 3.008034188034188, + "grad_norm": 51.59945297241211, + "learning_rate": 3.8034188034188036e-06, + "loss": 1.09, + "step": 890 + }, + { + "epoch": 3.0084615384615385, + "grad_norm": 0.2848564386367798, + "learning_rate": 3.846153846153847e-06, + "loss": 1.3275, + "step": 900 + }, + { + "epoch": 3.008888888888889, + "grad_norm": 0.22904685139656067, + "learning_rate": 3.88888888888889e-06, + "loss": 3.0584, + "step": 910 + }, + { + "epoch": 3.0093162393162394, + "grad_norm": 33.74935531616211, + "learning_rate": 3.9316239316239315e-06, + "loss": 1.7752, + "step": 920 + }, + { + "epoch": 3.00974358974359, + "grad_norm": 0.3561188280582428, + "learning_rate": 3.974358974358974e-06, + "loss": 0.9419, + "step": 930 + }, + { + "epoch": 3.01, + "eval_accuracy": 0.4, + "eval_loss": 3.042177200317383, + "eval_runtime": 33.5077, + "eval_samples_per_second": 0.746, + "eval_steps_per_second": 0.746, + "step": 936 + }, + { + "epoch": 4.00017094017094, + "grad_norm": 61.73008728027344, + "learning_rate": 4.017094017094018e-06, + "loss": 3.1491, + "step": 940 + }, + { + "epoch": 4.00059829059829, + "grad_norm": 32.0042839050293, + "learning_rate": 4.05982905982906e-06, + "loss": 1.1443, + "step": 950 + }, + { + "epoch": 4.001025641025641, + "grad_norm": 0.49480342864990234, + "learning_rate": 4.102564102564103e-06, + "loss": 1.1544, + "step": 960 + }, + { + "epoch": 4.001452991452991, + "grad_norm": 0.1361558586359024, + "learning_rate": 4.145299145299146e-06, + "loss": 0.574, + "step": 970 + }, + { + "epoch": 4.001880341880342, + "grad_norm": 0.10107045620679855, + "learning_rate": 4.188034188034188e-06, + "loss": 2.0072, + "step": 980 + }, + { + "epoch": 4.002307692307692, + "grad_norm": 36.34455871582031, + "learning_rate": 4.230769230769231e-06, + "loss": 1.849, + "step": 990 + }, + { + "epoch": 4.0027350427350425, + "grad_norm": 179.90792846679688, + "learning_rate": 4.273504273504274e-06, + "loss": 1.7365, + "step": 1000 + }, + { + "epoch": 4.003162393162393, + "grad_norm": 38.759735107421875, + "learning_rate": 4.316239316239317e-06, + "loss": 2.1145, + "step": 1010 + }, + { + "epoch": 4.003589743589743, + "grad_norm": 0.1710689514875412, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0095, + "step": 1020 + }, + { + "epoch": 4.004017094017094, + "grad_norm": 0.4290190041065216, + "learning_rate": 4.401709401709402e-06, + "loss": 1.7793, + "step": 1030 + }, + { + "epoch": 4.004444444444444, + "grad_norm": 32.45518493652344, + "learning_rate": 4.444444444444444e-06, + "loss": 1.7186, + "step": 1040 + }, + { + "epoch": 4.004871794871795, + "grad_norm": 36.26081466674805, + "learning_rate": 4.487179487179488e-06, + "loss": 1.7024, + "step": 1050 + }, + { + "epoch": 4.005299145299145, + "grad_norm": 0.3600796163082123, + "learning_rate": 4.5299145299145306e-06, + "loss": 0.5479, + "step": 1060 + }, + { + "epoch": 4.0057264957264955, + "grad_norm": 0.39198485016822815, + "learning_rate": 4.5726495726495725e-06, + "loss": 1.6426, + "step": 1070 + }, + { + "epoch": 4.006153846153846, + "grad_norm": 0.25071772933006287, + "learning_rate": 4.615384615384616e-06, + "loss": 1.0627, + "step": 1080 + }, + { + "epoch": 4.006581196581196, + "grad_norm": 35.45331573486328, + "learning_rate": 4.658119658119659e-06, + "loss": 2.8291, + "step": 1090 + }, + { + "epoch": 4.007008547008547, + "grad_norm": 0.4302135407924652, + "learning_rate": 4.700854700854701e-06, + "loss": 1.1957, + "step": 1100 + }, + { + "epoch": 4.007435897435897, + "grad_norm": 35.813323974609375, + "learning_rate": 4.743589743589744e-06, + "loss": 2.2429, + "step": 1110 + }, + { + "epoch": 4.007863247863248, + "grad_norm": 35.34008026123047, + "learning_rate": 4.786324786324787e-06, + "loss": 1.077, + "step": 1120 + }, + { + "epoch": 4.008290598290598, + "grad_norm": 1.468093991279602, + "learning_rate": 4.829059829059829e-06, + "loss": 2.7016, + "step": 1130 + }, + { + "epoch": 4.0087179487179485, + "grad_norm": 29.234296798706055, + "learning_rate": 4.871794871794872e-06, + "loss": 2.1301, + "step": 1140 + }, + { + "epoch": 4.009145299145299, + "grad_norm": 31.05660629272461, + "learning_rate": 4.914529914529915e-06, + "loss": 2.0786, + "step": 1150 + }, + { + "epoch": 4.009572649572649, + "grad_norm": 0.4601377546787262, + "learning_rate": 4.957264957264958e-06, + "loss": 2.0616, + "step": 1160 + }, + { + "epoch": 4.01, + "grad_norm": 1.4168052673339844, + "learning_rate": 5e-06, + "loss": 2.4333, + "step": 1170 + }, + { + "epoch": 4.01, + "eval_accuracy": 0.4, + "eval_loss": 2.6802823543548584, + "eval_runtime": 33.4138, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 1170 + }, + { + "epoch": 5.00042735042735, + "grad_norm": 0.4595077633857727, + "learning_rate": 5.042735042735043e-06, + "loss": 0.8973, + "step": 1180 + }, + { + "epoch": 5.000854700854701, + "grad_norm": 32.307037353515625, + "learning_rate": 5.085470085470086e-06, + "loss": 1.9978, + "step": 1190 + }, + { + "epoch": 5.001282051282051, + "grad_norm": 33.618709564208984, + "learning_rate": 5.128205128205128e-06, + "loss": 3.0544, + "step": 1200 + }, + { + "epoch": 5.001709401709402, + "grad_norm": 31.92491340637207, + "learning_rate": 5.1709401709401716e-06, + "loss": 1.8459, + "step": 1210 + }, + { + "epoch": 5.002136752136752, + "grad_norm": 0.769111156463623, + "learning_rate": 5.213675213675214e-06, + "loss": 1.928, + "step": 1220 + }, + { + "epoch": 5.0025641025641026, + "grad_norm": 30.650548934936523, + "learning_rate": 5.256410256410257e-06, + "loss": 1.1278, + "step": 1230 + }, + { + "epoch": 5.002991452991453, + "grad_norm": 31.011295318603516, + "learning_rate": 5.2991452991453e-06, + "loss": 1.8213, + "step": 1240 + }, + { + "epoch": 5.003418803418803, + "grad_norm": 0.40149080753326416, + "learning_rate": 5.341880341880342e-06, + "loss": 2.31, + "step": 1250 + }, + { + "epoch": 5.003846153846154, + "grad_norm": 0.4121752977371216, + "learning_rate": 5.384615384615385e-06, + "loss": 1.051, + "step": 1260 + }, + { + "epoch": 5.004273504273504, + "grad_norm": 0.29824018478393555, + "learning_rate": 5.4273504273504275e-06, + "loss": 2.0995, + "step": 1270 + }, + { + "epoch": 5.004700854700855, + "grad_norm": 0.4708477258682251, + "learning_rate": 5.470085470085471e-06, + "loss": 0.4382, + "step": 1280 + }, + { + "epoch": 5.005128205128205, + "grad_norm": 0.25618794560432434, + "learning_rate": 5.512820512820514e-06, + "loss": 2.2577, + "step": 1290 + }, + { + "epoch": 5.0055555555555555, + "grad_norm": 0.17655441164970398, + "learning_rate": 5.555555555555557e-06, + "loss": 0.6334, + "step": 1300 + }, + { + "epoch": 5.005982905982906, + "grad_norm": 29.26540184020996, + "learning_rate": 5.598290598290599e-06, + "loss": 1.6359, + "step": 1310 + }, + { + "epoch": 5.006410256410256, + "grad_norm": 0.33771806955337524, + "learning_rate": 5.641025641025641e-06, + "loss": 2.2658, + "step": 1320 + }, + { + "epoch": 5.006837606837607, + "grad_norm": 0.19950927793979645, + "learning_rate": 5.683760683760684e-06, + "loss": 0.6106, + "step": 1330 + }, + { + "epoch": 5.007264957264957, + "grad_norm": 32.63151931762695, + "learning_rate": 5.726495726495727e-06, + "loss": 1.9519, + "step": 1340 + }, + { + "epoch": 5.007692307692308, + "grad_norm": 0.3686060309410095, + "learning_rate": 5.769230769230769e-06, + "loss": 1.2481, + "step": 1350 + }, + { + "epoch": 5.008119658119658, + "grad_norm": 0.1446453183889389, + "learning_rate": 5.8119658119658126e-06, + "loss": 0.5926, + "step": 1360 + }, + { + "epoch": 5.0085470085470085, + "grad_norm": 32.86191177368164, + "learning_rate": 5.854700854700855e-06, + "loss": 2.247, + "step": 1370 + }, + { + "epoch": 5.008974358974359, + "grad_norm": 37.676414489746094, + "learning_rate": 5.897435897435898e-06, + "loss": 1.9409, + "step": 1380 + }, + { + "epoch": 5.009401709401709, + "grad_norm": 0.4700198471546173, + "learning_rate": 5.940170940170941e-06, + "loss": 1.4757, + "step": 1390 + }, + { + "epoch": 5.00982905982906, + "grad_norm": 0.603115975856781, + "learning_rate": 5.982905982905983e-06, + "loss": 1.4646, + "step": 1400 + }, + { + "epoch": 5.01, + "eval_accuracy": 0.4, + "eval_loss": 3.5354690551757812, + "eval_runtime": 33.4888, + "eval_samples_per_second": 0.747, + "eval_steps_per_second": 0.747, + "step": 1404 + }, + { + "epoch": 6.00025641025641, + "grad_norm": 0.20566977560520172, + "learning_rate": 6.025641025641026e-06, + "loss": 2.5308, + "step": 1410 + }, + { + "epoch": 6.000683760683761, + "grad_norm": 1.3430235385894775, + "learning_rate": 6.0683760683760684e-06, + "loss": 1.7984, + "step": 1420 + }, + { + "epoch": 6.001111111111111, + "grad_norm": 34.01142120361328, + "learning_rate": 6.111111111111112e-06, + "loss": 0.5602, + "step": 1430 + }, + { + "epoch": 6.001538461538462, + "grad_norm": 0.08811581879854202, + "learning_rate": 6.153846153846155e-06, + "loss": 0.5698, + "step": 1440 + }, + { + "epoch": 6.001965811965812, + "grad_norm": 31.072364807128906, + "learning_rate": 6.196581196581198e-06, + "loss": 1.9642, + "step": 1450 + }, + { + "epoch": 6.002393162393163, + "grad_norm": 0.3049279749393463, + "learning_rate": 6.23931623931624e-06, + "loss": 1.0951, + "step": 1460 + }, + { + "epoch": 6.002820512820513, + "grad_norm": 1.261183500289917, + "learning_rate": 6.282051282051282e-06, + "loss": 1.5718, + "step": 1470 + }, + { + "epoch": 6.003247863247863, + "grad_norm": 29.76000213623047, + "learning_rate": 6.324786324786325e-06, + "loss": 2.3047, + "step": 1480 + }, + { + "epoch": 6.003675213675214, + "grad_norm": 0.3626205325126648, + "learning_rate": 6.367521367521368e-06, + "loss": 1.7411, + "step": 1490 + }, + { + "epoch": 6.004102564102564, + "grad_norm": 27.959877014160156, + "learning_rate": 6.410256410256412e-06, + "loss": 2.4126, + "step": 1500 + }, + { + "epoch": 6.004529914529915, + "grad_norm": 0.5188838243484497, + "learning_rate": 6.4529914529914535e-06, + "loss": 1.3862, + "step": 1510 + }, + { + "epoch": 6.004957264957265, + "grad_norm": 0.29587042331695557, + "learning_rate": 6.495726495726496e-06, + "loss": 1.414, + "step": 1520 + }, + { + "epoch": 6.0053846153846155, + "grad_norm": 0.25534921884536743, + "learning_rate": 6.538461538461539e-06, + "loss": 1.7212, + "step": 1530 + }, + { + "epoch": 6.005811965811966, + "grad_norm": 0.2307765781879425, + "learning_rate": 6.581196581196582e-06, + "loss": 2.294, + "step": 1540 + }, + { + "epoch": 6.006239316239316, + "grad_norm": 28.337125778198242, + "learning_rate": 6.623931623931624e-06, + "loss": 0.946, + "step": 1550 + }, + { + "epoch": 6.006666666666667, + "grad_norm": 0.16784150898456573, + "learning_rate": 6.666666666666667e-06, + "loss": 1.1255, + "step": 1560 + }, + { + "epoch": 6.007094017094017, + "grad_norm": 27.95771598815918, + "learning_rate": 6.7094017094017094e-06, + "loss": 1.6833, + "step": 1570 + }, + { + "epoch": 6.007521367521368, + "grad_norm": 0.4080459773540497, + "learning_rate": 6.752136752136753e-06, + "loss": 1.3406, + "step": 1580 + }, + { + "epoch": 6.007948717948718, + "grad_norm": 0.8113716244697571, + "learning_rate": 6.794871794871796e-06, + "loss": 2.6112, + "step": 1590 + }, + { + "epoch": 6.0083760683760685, + "grad_norm": 30.46502685546875, + "learning_rate": 6.837606837606839e-06, + "loss": 1.4458, + "step": 1600 + }, + { + "epoch": 6.008803418803419, + "grad_norm": 0.22283227741718292, + "learning_rate": 6.880341880341881e-06, + "loss": 0.5283, + "step": 1610 + }, + { + "epoch": 6.009230769230769, + "grad_norm": 28.292848587036133, + "learning_rate": 6.923076923076923e-06, + "loss": 1.5511, + "step": 1620 + }, + { + "epoch": 6.00965811965812, + "grad_norm": 28.0618839263916, + "learning_rate": 6.965811965811966e-06, + "loss": 2.1201, + "step": 1630 + }, + { + "epoch": 6.01, + "eval_accuracy": 0.4, + "eval_loss": 3.0478899478912354, + "eval_runtime": 33.4068, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 1638 + }, + { + "epoch": 7.00008547008547, + "grad_norm": 33.30900573730469, + "learning_rate": 7.008547008547009e-06, + "loss": 1.429, + "step": 1640 + }, + { + "epoch": 7.000512820512821, + "grad_norm": 29.006980895996094, + "learning_rate": 7.051282051282053e-06, + "loss": 1.6062, + "step": 1650 + }, + { + "epoch": 7.000940170940171, + "grad_norm": 28.740236282348633, + "learning_rate": 7.0940170940170945e-06, + "loss": 2.1064, + "step": 1660 + }, + { + "epoch": 7.001367521367522, + "grad_norm": 0.5548082590103149, + "learning_rate": 7.136752136752137e-06, + "loss": 1.4227, + "step": 1670 + }, + { + "epoch": 7.001794871794872, + "grad_norm": 0.2548494040966034, + "learning_rate": 7.17948717948718e-06, + "loss": 0.4677, + "step": 1680 + }, + { + "epoch": 7.002222222222223, + "grad_norm": 0.08594454079866409, + "learning_rate": 7.222222222222223e-06, + "loss": 0.6487, + "step": 1690 + }, + { + "epoch": 7.002649572649573, + "grad_norm": 0.11181553453207016, + "learning_rate": 7.264957264957266e-06, + "loss": 0.6856, + "step": 1700 + }, + { + "epoch": 7.003076923076923, + "grad_norm": 0.09680427610874176, + "learning_rate": 7.307692307692308e-06, + "loss": 1.2589, + "step": 1710 + }, + { + "epoch": 7.003504273504274, + "grad_norm": 0.18477901816368103, + "learning_rate": 7.350427350427351e-06, + "loss": 2.0357, + "step": 1720 + }, + { + "epoch": 7.003931623931624, + "grad_norm": 28.30834197998047, + "learning_rate": 7.393162393162394e-06, + "loss": 1.7978, + "step": 1730 + }, + { + "epoch": 7.004358974358975, + "grad_norm": 0.4138849675655365, + "learning_rate": 7.435897435897437e-06, + "loss": 1.4938, + "step": 1740 + }, + { + "epoch": 7.004786324786325, + "grad_norm": 46.25049591064453, + "learning_rate": 7.47863247863248e-06, + "loss": 1.2354, + "step": 1750 + }, + { + "epoch": 7.0052136752136756, + "grad_norm": 0.2702569365501404, + "learning_rate": 7.521367521367522e-06, + "loss": 1.8753, + "step": 1760 + }, + { + "epoch": 7.005641025641026, + "grad_norm": 0.2681766450405121, + "learning_rate": 7.564102564102564e-06, + "loss": 1.2446, + "step": 1770 + }, + { + "epoch": 7.006068376068376, + "grad_norm": 47.34070587158203, + "learning_rate": 7.606837606837607e-06, + "loss": 2.2493, + "step": 1780 + }, + { + "epoch": 7.006495726495727, + "grad_norm": 31.515134811401367, + "learning_rate": 7.649572649572649e-06, + "loss": 2.1708, + "step": 1790 + }, + { + "epoch": 7.006923076923077, + "grad_norm": 0.44051307439804077, + "learning_rate": 7.692307692307694e-06, + "loss": 0.9425, + "step": 1800 + }, + { + "epoch": 7.007350427350428, + "grad_norm": 0.16471922397613525, + "learning_rate": 7.735042735042736e-06, + "loss": 1.6227, + "step": 1810 + }, + { + "epoch": 7.007777777777778, + "grad_norm": 36.21978759765625, + "learning_rate": 7.77777777777778e-06, + "loss": 3.136, + "step": 1820 + }, + { + "epoch": 7.0082051282051285, + "grad_norm": 0.6311964988708496, + "learning_rate": 7.820512820512822e-06, + "loss": 1.6591, + "step": 1830 + }, + { + "epoch": 7.008632478632479, + "grad_norm": 0.15079300105571747, + "learning_rate": 7.863247863247863e-06, + "loss": 0.4923, + "step": 1840 + }, + { + "epoch": 7.009059829059829, + "grad_norm": 0.0793166533112526, + "learning_rate": 7.905982905982906e-06, + "loss": 0.6274, + "step": 1850 + }, + { + "epoch": 7.00948717948718, + "grad_norm": 39.972232818603516, + "learning_rate": 7.948717948717949e-06, + "loss": 1.902, + "step": 1860 + }, + { + "epoch": 7.00991452991453, + "grad_norm": 2.84613299369812, + "learning_rate": 7.991452991452993e-06, + "loss": 2.9021, + "step": 1870 + }, + { + "epoch": 7.01, + "eval_accuracy": 0.4, + "eval_loss": 2.8181352615356445, + "eval_runtime": 33.4387, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 1872 + }, + { + "epoch": 8.00034188034188, + "grad_norm": 32.629676818847656, + "learning_rate": 8.034188034188036e-06, + "loss": 0.6487, + "step": 1880 + }, + { + "epoch": 8.000769230769231, + "grad_norm": 0.23920656740665436, + "learning_rate": 8.076923076923077e-06, + "loss": 1.2006, + "step": 1890 + }, + { + "epoch": 8.00119658119658, + "grad_norm": 0.14234939217567444, + "learning_rate": 8.11965811965812e-06, + "loss": 1.2065, + "step": 1900 + }, + { + "epoch": 8.001623931623932, + "grad_norm": 0.12460368871688843, + "learning_rate": 8.162393162393163e-06, + "loss": 0.5436, + "step": 1910 + }, + { + "epoch": 8.002051282051282, + "grad_norm": 0.43331626057624817, + "learning_rate": 8.205128205128205e-06, + "loss": 1.5192, + "step": 1920 + }, + { + "epoch": 8.002478632478633, + "grad_norm": 0.12986142933368683, + "learning_rate": 8.247863247863248e-06, + "loss": 0.6905, + "step": 1930 + }, + { + "epoch": 8.002905982905983, + "grad_norm": 0.5832127928733826, + "learning_rate": 8.290598290598293e-06, + "loss": 3.0564, + "step": 1940 + }, + { + "epoch": 8.003333333333334, + "grad_norm": 28.78748321533203, + "learning_rate": 8.333333333333334e-06, + "loss": 2.3034, + "step": 1950 + }, + { + "epoch": 8.003760683760683, + "grad_norm": 31.847166061401367, + "learning_rate": 8.376068376068377e-06, + "loss": 1.9432, + "step": 1960 + }, + { + "epoch": 8.004188034188035, + "grad_norm": 0.23129618167877197, + "learning_rate": 8.41880341880342e-06, + "loss": 0.8151, + "step": 1970 + }, + { + "epoch": 8.004615384615384, + "grad_norm": 0.08719867467880249, + "learning_rate": 8.461538461538462e-06, + "loss": 0.4525, + "step": 1980 + }, + { + "epoch": 8.005042735042736, + "grad_norm": 0.17510618269443512, + "learning_rate": 8.504273504273505e-06, + "loss": 2.3329, + "step": 1990 + }, + { + "epoch": 8.005470085470085, + "grad_norm": 0.23968863487243652, + "learning_rate": 8.547008547008548e-06, + "loss": 1.4327, + "step": 2000 + }, + { + "epoch": 8.005897435897436, + "grad_norm": 0.1258096992969513, + "learning_rate": 8.58974358974359e-06, + "loss": 1.5496, + "step": 2010 + }, + { + "epoch": 8.006324786324786, + "grad_norm": 33.13344192504883, + "learning_rate": 8.632478632478633e-06, + "loss": 1.8633, + "step": 2020 + }, + { + "epoch": 8.006752136752137, + "grad_norm": 34.57917022705078, + "learning_rate": 8.675213675213676e-06, + "loss": 2.1156, + "step": 2030 + }, + { + "epoch": 8.007179487179487, + "grad_norm": 30.499267578125, + "learning_rate": 8.717948717948719e-06, + "loss": 1.4366, + "step": 2040 + }, + { + "epoch": 8.007606837606838, + "grad_norm": 0.2296626716852188, + "learning_rate": 8.760683760683762e-06, + "loss": 0.5388, + "step": 2050 + }, + { + "epoch": 8.008034188034188, + "grad_norm": 0.6007127165794373, + "learning_rate": 8.803418803418804e-06, + "loss": 2.0019, + "step": 2060 + }, + { + "epoch": 8.008461538461539, + "grad_norm": 29.225831985473633, + "learning_rate": 8.846153846153847e-06, + "loss": 1.6262, + "step": 2070 + }, + { + "epoch": 8.008888888888889, + "grad_norm": 0.9478847980499268, + "learning_rate": 8.888888888888888e-06, + "loss": 2.0313, + "step": 2080 + }, + { + "epoch": 8.00931623931624, + "grad_norm": 0.12494786083698273, + "learning_rate": 8.931623931623933e-06, + "loss": 0.9103, + "step": 2090 + }, + { + "epoch": 8.00974358974359, + "grad_norm": 23.248950958251953, + "learning_rate": 8.974358974358976e-06, + "loss": 2.1527, + "step": 2100 + }, + { + "epoch": 8.01, + "eval_accuracy": 0.4, + "eval_loss": 2.7605271339416504, + "eval_runtime": 32.6542, + "eval_samples_per_second": 0.766, + "eval_steps_per_second": 0.766, + "step": 2106 + }, + { + "epoch": 9.00017094017094, + "grad_norm": 25.770069122314453, + "learning_rate": 9.017094017094018e-06, + "loss": 1.5706, + "step": 2110 + }, + { + "epoch": 9.00059829059829, + "grad_norm": 0.7022117376327515, + "learning_rate": 9.059829059829061e-06, + "loss": 0.4205, + "step": 2120 + }, + { + "epoch": 9.001025641025642, + "grad_norm": 0.08867479860782623, + "learning_rate": 9.102564102564104e-06, + "loss": 1.2611, + "step": 2130 + }, + { + "epoch": 9.001452991452991, + "grad_norm": 0.09678523242473602, + "learning_rate": 9.145299145299145e-06, + "loss": 1.1773, + "step": 2140 + }, + { + "epoch": 9.001880341880343, + "grad_norm": 0.15343564748764038, + "learning_rate": 9.188034188034188e-06, + "loss": 1.2586, + "step": 2150 + }, + { + "epoch": 9.002307692307692, + "grad_norm": 0.35261672735214233, + "learning_rate": 9.230769230769232e-06, + "loss": 0.8426, + "step": 2160 + }, + { + "epoch": 9.002735042735043, + "grad_norm": 0.2280280739068985, + "learning_rate": 9.273504273504275e-06, + "loss": 1.1276, + "step": 2170 + }, + { + "epoch": 9.003162393162393, + "grad_norm": 0.11199548840522766, + "learning_rate": 9.316239316239318e-06, + "loss": 2.7832, + "step": 2180 + }, + { + "epoch": 9.003589743589744, + "grad_norm": 0.2621850073337555, + "learning_rate": 9.358974358974359e-06, + "loss": 0.9458, + "step": 2190 + }, + { + "epoch": 9.004017094017094, + "grad_norm": 0.36012938618659973, + "learning_rate": 9.401709401709402e-06, + "loss": 1.1242, + "step": 2200 + }, + { + "epoch": 9.004444444444445, + "grad_norm": 48.78242492675781, + "learning_rate": 9.444444444444445e-06, + "loss": 3.3251, + "step": 2210 + }, + { + "epoch": 9.004871794871795, + "grad_norm": 0.15066808462142944, + "learning_rate": 9.487179487179487e-06, + "loss": 1.708, + "step": 2220 + }, + { + "epoch": 9.005299145299146, + "grad_norm": 0.4691851735115051, + "learning_rate": 9.52991452991453e-06, + "loss": 1.553, + "step": 2230 + }, + { + "epoch": 9.005726495726496, + "grad_norm": 28.600677490234375, + "learning_rate": 9.572649572649575e-06, + "loss": 1.2926, + "step": 2240 + }, + { + "epoch": 9.006153846153847, + "grad_norm": 32.24127960205078, + "learning_rate": 9.615384615384616e-06, + "loss": 2.4733, + "step": 2250 + }, + { + "epoch": 9.006581196581196, + "grad_norm": 0.7631069421768188, + "learning_rate": 9.658119658119659e-06, + "loss": 1.7091, + "step": 2260 + }, + { + "epoch": 9.007008547008548, + "grad_norm": 0.2995574176311493, + "learning_rate": 9.700854700854701e-06, + "loss": 2.0053, + "step": 2270 + }, + { + "epoch": 9.007435897435897, + "grad_norm": 3.1298446655273438, + "learning_rate": 9.743589743589744e-06, + "loss": 1.1043, + "step": 2280 + }, + { + "epoch": 9.007863247863249, + "grad_norm": 0.1203409880399704, + "learning_rate": 9.786324786324787e-06, + "loss": 0.6468, + "step": 2290 + }, + { + "epoch": 9.008290598290598, + "grad_norm": 0.3753611445426941, + "learning_rate": 9.82905982905983e-06, + "loss": 1.8003, + "step": 2300 + }, + { + "epoch": 9.00871794871795, + "grad_norm": 0.24749663472175598, + "learning_rate": 9.871794871794872e-06, + "loss": 0.6302, + "step": 2310 + }, + { + "epoch": 9.009145299145299, + "grad_norm": 0.15980985760688782, + "learning_rate": 9.914529914529915e-06, + "loss": 1.7506, + "step": 2320 + }, + { + "epoch": 9.00957264957265, + "grad_norm": 30.294530868530273, + "learning_rate": 9.957264957264958e-06, + "loss": 2.5406, + "step": 2330 + }, + { + "epoch": 9.01, + "grad_norm": 1.435558557510376, + "learning_rate": 1e-05, + "loss": 1.9428, + "step": 2340 + }, + { + "epoch": 9.01, + "eval_accuracy": 0.4, + "eval_loss": 2.4512882232666016, + "eval_runtime": 32.6431, + "eval_samples_per_second": 0.766, + "eval_steps_per_second": 0.766, + "step": 2340 + }, + { + "epoch": 10.000427350427351, + "grad_norm": 26.23455810546875, + "learning_rate": 9.99525166191833e-06, + "loss": 0.9502, + "step": 2350 + }, + { + "epoch": 10.0008547008547, + "grad_norm": 28.52158546447754, + "learning_rate": 9.990503323836657e-06, + "loss": 1.7874, + "step": 2360 + }, + { + "epoch": 10.001282051282052, + "grad_norm": 0.10940665751695633, + "learning_rate": 9.985754985754987e-06, + "loss": 1.1145, + "step": 2370 + }, + { + "epoch": 10.001709401709402, + "grad_norm": 0.49054867029190063, + "learning_rate": 9.981006647673314e-06, + "loss": 2.1067, + "step": 2380 + }, + { + "epoch": 10.002136752136753, + "grad_norm": 0.4312061071395874, + "learning_rate": 9.976258309591643e-06, + "loss": 1.4459, + "step": 2390 + }, + { + "epoch": 10.002564102564103, + "grad_norm": 0.9604039788246155, + "learning_rate": 9.971509971509972e-06, + "loss": 1.5741, + "step": 2400 + }, + { + "epoch": 10.002991452991454, + "grad_norm": 28.289756774902344, + "learning_rate": 9.966761633428301e-06, + "loss": 3.0157, + "step": 2410 + }, + { + "epoch": 10.003418803418803, + "grad_norm": 1.3765814304351807, + "learning_rate": 9.96201329534663e-06, + "loss": 1.7118, + "step": 2420 + }, + { + "epoch": 10.003846153846155, + "grad_norm": 0.4429996609687805, + "learning_rate": 9.957264957264958e-06, + "loss": 1.8346, + "step": 2430 + }, + { + "epoch": 10.004273504273504, + "grad_norm": 0.1344616711139679, + "learning_rate": 9.952516619183287e-06, + "loss": 1.1072, + "step": 2440 + }, + { + "epoch": 10.004700854700856, + "grad_norm": 0.2778800129890442, + "learning_rate": 9.947768281101615e-06, + "loss": 1.0562, + "step": 2450 + }, + { + "epoch": 10.005128205128205, + "grad_norm": 41.122074127197266, + "learning_rate": 9.943019943019944e-06, + "loss": 2.096, + "step": 2460 + }, + { + "epoch": 10.005555555555556, + "grad_norm": 0.6196286082267761, + "learning_rate": 9.938271604938273e-06, + "loss": 1.9945, + "step": 2470 + }, + { + "epoch": 10.005982905982906, + "grad_norm": 38.4301872253418, + "learning_rate": 9.9335232668566e-06, + "loss": 1.5745, + "step": 2480 + }, + { + "epoch": 10.006410256410257, + "grad_norm": 0.5608100295066833, + "learning_rate": 9.92877492877493e-06, + "loss": 1.7862, + "step": 2490 + }, + { + "epoch": 10.006837606837607, + "grad_norm": 0.08879340440034866, + "learning_rate": 9.924026590693259e-06, + "loss": 1.2266, + "step": 2500 + }, + { + "epoch": 10.007264957264958, + "grad_norm": 0.14399921894073486, + "learning_rate": 9.919278252611588e-06, + "loss": 1.5128, + "step": 2510 + }, + { + "epoch": 10.007692307692308, + "grad_norm": 0.567003071308136, + "learning_rate": 9.914529914529915e-06, + "loss": 0.5539, + "step": 2520 + }, + { + "epoch": 10.008119658119659, + "grad_norm": 0.1469346433877945, + "learning_rate": 9.909781576448244e-06, + "loss": 1.3608, + "step": 2530 + }, + { + "epoch": 10.008547008547009, + "grad_norm": 0.6318036913871765, + "learning_rate": 9.905033238366572e-06, + "loss": 0.6639, + "step": 2540 + }, + { + "epoch": 10.00897435897436, + "grad_norm": 0.13676932454109192, + "learning_rate": 9.900284900284901e-06, + "loss": 2.7342, + "step": 2550 + }, + { + "epoch": 10.00940170940171, + "grad_norm": 28.927698135375977, + "learning_rate": 9.89553656220323e-06, + "loss": 2.5179, + "step": 2560 + }, + { + "epoch": 10.00982905982906, + "grad_norm": 26.96585464477539, + "learning_rate": 9.890788224121558e-06, + "loss": 1.6949, + "step": 2570 + }, + { + "epoch": 10.01, + "eval_accuracy": 0.4, + "eval_loss": 3.2310287952423096, + "eval_runtime": 32.7403, + "eval_samples_per_second": 0.764, + "eval_steps_per_second": 0.764, + "step": 2574 + }, + { + "epoch": 11.00025641025641, + "grad_norm": 29.56972312927246, + "learning_rate": 9.886039886039887e-06, + "loss": 2.0711, + "step": 2580 + }, + { + "epoch": 11.00068376068376, + "grad_norm": 1.1823328733444214, + "learning_rate": 9.881291547958214e-06, + "loss": 1.3947, + "step": 2590 + }, + { + "epoch": 11.001111111111111, + "grad_norm": 34.95638656616211, + "learning_rate": 9.876543209876543e-06, + "loss": 1.3976, + "step": 2600 + }, + { + "epoch": 11.00153846153846, + "grad_norm": 0.5684574842453003, + "learning_rate": 9.871794871794872e-06, + "loss": 0.5828, + "step": 2610 + }, + { + "epoch": 11.001965811965812, + "grad_norm": 0.13000904023647308, + "learning_rate": 9.867046533713202e-06, + "loss": 0.547, + "step": 2620 + }, + { + "epoch": 11.002393162393162, + "grad_norm": 0.08917149901390076, + "learning_rate": 9.86229819563153e-06, + "loss": 2.5733, + "step": 2630 + }, + { + "epoch": 11.002820512820513, + "grad_norm": 27.73358154296875, + "learning_rate": 9.857549857549858e-06, + "loss": 3.1395, + "step": 2640 + }, + { + "epoch": 11.003247863247863, + "grad_norm": 24.41839027404785, + "learning_rate": 9.852801519468187e-06, + "loss": 1.2438, + "step": 2650 + }, + { + "epoch": 11.003675213675214, + "grad_norm": 3.812605142593384, + "learning_rate": 9.848053181386515e-06, + "loss": 1.9698, + "step": 2660 + }, + { + "epoch": 11.004102564102563, + "grad_norm": 1.3905110359191895, + "learning_rate": 9.843304843304844e-06, + "loss": 1.0108, + "step": 2670 + }, + { + "epoch": 11.004529914529915, + "grad_norm": 0.7647677063941956, + "learning_rate": 9.838556505223173e-06, + "loss": 2.3101, + "step": 2680 + }, + { + "epoch": 11.004957264957264, + "grad_norm": 31.24732780456543, + "learning_rate": 9.8338081671415e-06, + "loss": 2.3872, + "step": 2690 + }, + { + "epoch": 11.005384615384616, + "grad_norm": 29.57086944580078, + "learning_rate": 9.82905982905983e-06, + "loss": 1.7839, + "step": 2700 + }, + { + "epoch": 11.005811965811965, + "grad_norm": 0.18307341635227203, + "learning_rate": 9.824311490978159e-06, + "loss": 1.4724, + "step": 2710 + }, + { + "epoch": 11.006239316239316, + "grad_norm": 0.0853017047047615, + "learning_rate": 9.819563152896488e-06, + "loss": 0.2625, + "step": 2720 + }, + { + "epoch": 11.006666666666666, + "grad_norm": 0.10152238607406616, + "learning_rate": 9.814814814814815e-06, + "loss": 1.0233, + "step": 2730 + }, + { + "epoch": 11.007094017094017, + "grad_norm": 4.792705535888672, + "learning_rate": 9.810066476733145e-06, + "loss": 2.28, + "step": 2740 + }, + { + "epoch": 11.007521367521367, + "grad_norm": 0.5689181685447693, + "learning_rate": 9.805318138651474e-06, + "loss": 1.3607, + "step": 2750 + }, + { + "epoch": 11.007948717948718, + "grad_norm": 0.07235058397054672, + "learning_rate": 9.800569800569801e-06, + "loss": 0.5546, + "step": 2760 + }, + { + "epoch": 11.008376068376068, + "grad_norm": 0.18848180770874023, + "learning_rate": 9.79582146248813e-06, + "loss": 0.0045, + "step": 2770 + }, + { + "epoch": 11.008803418803419, + "grad_norm": 0.14690843224525452, + "learning_rate": 9.791073124406458e-06, + "loss": 0.6331, + "step": 2780 + }, + { + "epoch": 11.009230769230768, + "grad_norm": 35.11496353149414, + "learning_rate": 9.786324786324787e-06, + "loss": 1.3858, + "step": 2790 + }, + { + "epoch": 11.00965811965812, + "grad_norm": 0.35278671979904175, + "learning_rate": 9.781576448243116e-06, + "loss": 0.7839, + "step": 2800 + }, + { + "epoch": 11.01, + "eval_accuracy": 0.4, + "eval_loss": 3.237220525741577, + "eval_runtime": 31.3652, + "eval_samples_per_second": 0.797, + "eval_steps_per_second": 0.797, + "step": 2808 + }, + { + "epoch": 12.00008547008547, + "grad_norm": 0.31852149963378906, + "learning_rate": 9.776828110161445e-06, + "loss": 1.8794, + "step": 2810 + }, + { + "epoch": 12.00051282051282, + "grad_norm": 1.0761709213256836, + "learning_rate": 9.772079772079773e-06, + "loss": 1.9101, + "step": 2820 + }, + { + "epoch": 12.00094017094017, + "grad_norm": 0.06357964873313904, + "learning_rate": 9.767331433998102e-06, + "loss": 1.1383, + "step": 2830 + }, + { + "epoch": 12.001367521367522, + "grad_norm": 54.093082427978516, + "learning_rate": 9.762583095916431e-06, + "loss": 2.112, + "step": 2840 + }, + { + "epoch": 12.001794871794871, + "grad_norm": 0.2823314070701599, + "learning_rate": 9.757834757834758e-06, + "loss": 0.5963, + "step": 2850 + }, + { + "epoch": 12.002222222222223, + "grad_norm": 0.3854888677597046, + "learning_rate": 9.753086419753087e-06, + "loss": 0.5196, + "step": 2860 + }, + { + "epoch": 12.002649572649572, + "grad_norm": 32.18788146972656, + "learning_rate": 9.748338081671415e-06, + "loss": 3.0031, + "step": 2870 + }, + { + "epoch": 12.003076923076923, + "grad_norm": 1.2660186290740967, + "learning_rate": 9.743589743589744e-06, + "loss": 1.9453, + "step": 2880 + }, + { + "epoch": 12.003504273504273, + "grad_norm": 49.34856033325195, + "learning_rate": 9.738841405508073e-06, + "loss": 2.1836, + "step": 2890 + }, + { + "epoch": 12.003931623931624, + "grad_norm": 3.923487663269043, + "learning_rate": 9.7340930674264e-06, + "loss": 0.6615, + "step": 2900 + }, + { + "epoch": 12.004358974358974, + "grad_norm": 0.09915890544652939, + "learning_rate": 9.72934472934473e-06, + "loss": 1.8817, + "step": 2910 + }, + { + "epoch": 12.004786324786325, + "grad_norm": 0.21953581273555756, + "learning_rate": 9.724596391263059e-06, + "loss": 2.0783, + "step": 2920 + }, + { + "epoch": 12.005213675213675, + "grad_norm": 0.0524006113409996, + "learning_rate": 9.719848053181388e-06, + "loss": 1.1279, + "step": 2930 + }, + { + "epoch": 12.005641025641026, + "grad_norm": 56.099334716796875, + "learning_rate": 9.715099715099716e-06, + "loss": 1.4809, + "step": 2940 + }, + { + "epoch": 12.006068376068376, + "grad_norm": 0.5895985960960388, + "learning_rate": 9.710351377018045e-06, + "loss": 0.5329, + "step": 2950 + }, + { + "epoch": 12.006495726495727, + "grad_norm": 42.8546257019043, + "learning_rate": 9.705603038936374e-06, + "loss": 2.1514, + "step": 2960 + }, + { + "epoch": 12.006923076923076, + "grad_norm": 0.03076835907995701, + "learning_rate": 9.700854700854701e-06, + "loss": 0.4221, + "step": 2970 + }, + { + "epoch": 12.007350427350428, + "grad_norm": 0.02532465010881424, + "learning_rate": 9.69610636277303e-06, + "loss": 2.3099, + "step": 2980 + }, + { + "epoch": 12.007777777777777, + "grad_norm": 62.39934158325195, + "learning_rate": 9.691358024691358e-06, + "loss": 0.6646, + "step": 2990 + }, + { + "epoch": 12.008205128205129, + "grad_norm": 54.54960632324219, + "learning_rate": 9.686609686609687e-06, + "loss": 0.4507, + "step": 3000 + }, + { + "epoch": 12.008632478632478, + "grad_norm": 1.0829317569732666, + "learning_rate": 9.681861348528016e-06, + "loss": 1.9857, + "step": 3010 + }, + { + "epoch": 12.00905982905983, + "grad_norm": 11.404279708862305, + "learning_rate": 9.677113010446345e-06, + "loss": 1.1354, + "step": 3020 + }, + { + "epoch": 12.009487179487179, + "grad_norm": 0.041183874011039734, + "learning_rate": 9.672364672364673e-06, + "loss": 0.8741, + "step": 3030 + }, + { + "epoch": 12.00991452991453, + "grad_norm": 19.812788009643555, + "learning_rate": 9.667616334283002e-06, + "loss": 0.3228, + "step": 3040 + }, + { + "epoch": 12.01, + "eval_accuracy": 0.4, + "eval_loss": 4.458773136138916, + "eval_runtime": 31.367, + "eval_samples_per_second": 0.797, + "eval_steps_per_second": 0.797, + "step": 3042 + }, + { + "epoch": 13.00034188034188, + "grad_norm": 49.4742317199707, + "learning_rate": 9.662867996201331e-06, + "loss": 2.7291, + "step": 3050 + }, + { + "epoch": 13.000769230769231, + "grad_norm": 33.5203857421875, + "learning_rate": 9.658119658119659e-06, + "loss": 1.3205, + "step": 3060 + }, + { + "epoch": 13.00119658119658, + "grad_norm": 32.33980941772461, + "learning_rate": 9.653371320037988e-06, + "loss": 1.9122, + "step": 3070 + }, + { + "epoch": 13.001623931623932, + "grad_norm": 0.2694286108016968, + "learning_rate": 9.648622981956317e-06, + "loss": 1.0847, + "step": 3080 + }, + { + "epoch": 13.002051282051282, + "grad_norm": 0.17534177005290985, + "learning_rate": 9.643874643874644e-06, + "loss": 1.2066, + "step": 3090 + }, + { + "epoch": 13.002478632478633, + "grad_norm": 0.06266115605831146, + "learning_rate": 9.639126305792973e-06, + "loss": 1.0707, + "step": 3100 + }, + { + "epoch": 13.002905982905983, + "grad_norm": 28.773160934448242, + "learning_rate": 9.6343779677113e-06, + "loss": 3.3066, + "step": 3110 + }, + { + "epoch": 13.003333333333334, + "grad_norm": 36.008384704589844, + "learning_rate": 9.62962962962963e-06, + "loss": 1.0375, + "step": 3120 + }, + { + "epoch": 13.003760683760683, + "grad_norm": 0.03540613874793053, + "learning_rate": 9.624881291547959e-06, + "loss": 0.5259, + "step": 3130 + }, + { + "epoch": 13.004188034188035, + "grad_norm": 0.03709403797984123, + "learning_rate": 9.620132953466288e-06, + "loss": 0.0029, + "step": 3140 + }, + { + "epoch": 13.004615384615384, + "grad_norm": 0.5573925375938416, + "learning_rate": 9.615384615384616e-06, + "loss": 1.7656, + "step": 3150 + }, + { + "epoch": 13.005042735042736, + "grad_norm": 0.03247790411114693, + "learning_rate": 9.610636277302945e-06, + "loss": 1.6948, + "step": 3160 + }, + { + "epoch": 13.005470085470085, + "grad_norm": 32.801788330078125, + "learning_rate": 9.605887939221274e-06, + "loss": 2.2156, + "step": 3170 + }, + { + "epoch": 13.005897435897436, + "grad_norm": 3.8869693279266357, + "learning_rate": 9.601139601139601e-06, + "loss": 1.0666, + "step": 3180 + }, + { + "epoch": 13.006324786324786, + "grad_norm": 0.8103746175765991, + "learning_rate": 9.59639126305793e-06, + "loss": 1.5307, + "step": 3190 + }, + { + "epoch": 13.006752136752137, + "grad_norm": 10.717965126037598, + "learning_rate": 9.591642924976258e-06, + "loss": 1.6417, + "step": 3200 + }, + { + "epoch": 13.007179487179487, + "grad_norm": 41.972896575927734, + "learning_rate": 9.586894586894587e-06, + "loss": 0.5114, + "step": 3210 + }, + { + "epoch": 13.007606837606838, + "grad_norm": 79.40288543701172, + "learning_rate": 9.582146248812916e-06, + "loss": 1.7693, + "step": 3220 + }, + { + "epoch": 13.008034188034188, + "grad_norm": 0.05152087286114693, + "learning_rate": 9.577397910731245e-06, + "loss": 1.1784, + "step": 3230 + }, + { + "epoch": 13.008461538461539, + "grad_norm": 0.10308440029621124, + "learning_rate": 9.572649572649575e-06, + "loss": 1.9728, + "step": 3240 + }, + { + "epoch": 13.008888888888889, + "grad_norm": 0.10380467772483826, + "learning_rate": 9.567901234567902e-06, + "loss": 1.1541, + "step": 3250 + }, + { + "epoch": 13.00931623931624, + "grad_norm": 0.8398118019104004, + "learning_rate": 9.563152896486231e-06, + "loss": 0.5554, + "step": 3260 + }, + { + "epoch": 13.00974358974359, + "grad_norm": 41.57830810546875, + "learning_rate": 9.558404558404559e-06, + "loss": 3.5377, + "step": 3270 + }, + { + "epoch": 13.01, + "eval_accuracy": 0.4, + "eval_loss": 2.862149238586426, + "eval_runtime": 31.9079, + "eval_samples_per_second": 0.784, + "eval_steps_per_second": 0.784, + "step": 3276 + }, + { + "epoch": 14.00017094017094, + "grad_norm": 37.766448974609375, + "learning_rate": 9.553656220322888e-06, + "loss": 1.0101, + "step": 3280 + }, + { + "epoch": 14.00059829059829, + "grad_norm": 0.0569000206887722, + "learning_rate": 9.548907882241217e-06, + "loss": 0.5743, + "step": 3290 + }, + { + "epoch": 14.001025641025642, + "grad_norm": 0.07739593833684921, + "learning_rate": 9.544159544159544e-06, + "loss": 0.6754, + "step": 3300 + }, + { + "epoch": 14.001452991452991, + "grad_norm": 0.08230680227279663, + "learning_rate": 9.539411206077874e-06, + "loss": 2.4253, + "step": 3310 + }, + { + "epoch": 14.001880341880343, + "grad_norm": 64.62518310546875, + "learning_rate": 9.534662867996201e-06, + "loss": 2.6924, + "step": 3320 + }, + { + "epoch": 14.002307692307692, + "grad_norm": 6.906940460205078, + "learning_rate": 9.52991452991453e-06, + "loss": 1.5079, + "step": 3330 + }, + { + "epoch": 14.002735042735043, + "grad_norm": 42.747108459472656, + "learning_rate": 9.52516619183286e-06, + "loss": 0.4776, + "step": 3340 + }, + { + "epoch": 14.003162393162393, + "grad_norm": 0.0985599160194397, + "learning_rate": 9.520417853751188e-06, + "loss": 1.2448, + "step": 3350 + }, + { + "epoch": 14.003589743589744, + "grad_norm": 43.92565155029297, + "learning_rate": 9.515669515669516e-06, + "loss": 1.1799, + "step": 3360 + }, + { + "epoch": 14.004017094017094, + "grad_norm": 0.21025124192237854, + "learning_rate": 9.510921177587845e-06, + "loss": 1.5539, + "step": 3370 + }, + { + "epoch": 14.004444444444445, + "grad_norm": 0.03208040073513985, + "learning_rate": 9.506172839506174e-06, + "loss": 0.0046, + "step": 3380 + }, + { + "epoch": 14.004871794871795, + "grad_norm": 0.054694049060344696, + "learning_rate": 9.501424501424502e-06, + "loss": 1.6939, + "step": 3390 + }, + { + "epoch": 14.005299145299146, + "grad_norm": 70.33654022216797, + "learning_rate": 9.49667616334283e-06, + "loss": 1.1934, + "step": 3400 + }, + { + "epoch": 14.005726495726496, + "grad_norm": 22.107633590698242, + "learning_rate": 9.49192782526116e-06, + "loss": 1.3934, + "step": 3410 + }, + { + "epoch": 14.006153846153847, + "grad_norm": 24.67310905456543, + "learning_rate": 9.487179487179487e-06, + "loss": 1.545, + "step": 3420 + }, + { + "epoch": 14.006581196581196, + "grad_norm": 15.487519264221191, + "learning_rate": 9.482431149097816e-06, + "loss": 1.0724, + "step": 3430 + }, + { + "epoch": 14.007008547008548, + "grad_norm": 8.748615264892578, + "learning_rate": 9.477682811016146e-06, + "loss": 1.1913, + "step": 3440 + }, + { + "epoch": 14.007435897435897, + "grad_norm": 0.04509786143898964, + "learning_rate": 9.472934472934475e-06, + "loss": 0.8465, + "step": 3450 + }, + { + "epoch": 14.007863247863249, + "grad_norm": 0.052408602088689804, + "learning_rate": 9.468186134852802e-06, + "loss": 0.9335, + "step": 3460 + }, + { + "epoch": 14.008290598290598, + "grad_norm": 4.355685234069824, + "learning_rate": 9.463437796771131e-06, + "loss": 1.4667, + "step": 3470 + }, + { + "epoch": 14.00871794871795, + "grad_norm": 31.14105796813965, + "learning_rate": 9.458689458689459e-06, + "loss": 1.9456, + "step": 3480 + }, + { + "epoch": 14.009145299145299, + "grad_norm": 5.5146284103393555, + "learning_rate": 9.453941120607788e-06, + "loss": 2.4975, + "step": 3490 + }, + { + "epoch": 14.00957264957265, + "grad_norm": 1.885633945465088, + "learning_rate": 9.449192782526117e-06, + "loss": 1.6884, + "step": 3500 + }, + { + "epoch": 14.01, + "grad_norm": 0.19068482518196106, + "learning_rate": 9.444444444444445e-06, + "loss": 0.509, + "step": 3510 + }, + { + "epoch": 14.01, + "eval_accuracy": 0.4, + "eval_loss": 2.746034860610962, + "eval_runtime": 31.4372, + "eval_samples_per_second": 0.795, + "eval_steps_per_second": 0.795, + "step": 3510 + }, + { + "epoch": 15.000427350427351, + "grad_norm": 0.03531758114695549, + "learning_rate": 9.439696106362774e-06, + "loss": 1.7623, + "step": 3520 + }, + { + "epoch": 15.0008547008547, + "grad_norm": 0.2500402331352234, + "learning_rate": 9.434947768281101e-06, + "loss": 0.9918, + "step": 3530 + }, + { + "epoch": 15.001282051282052, + "grad_norm": 6.20103120803833, + "learning_rate": 9.430199430199432e-06, + "loss": 1.3489, + "step": 3540 + }, + { + "epoch": 15.001709401709402, + "grad_norm": 0.275524377822876, + "learning_rate": 9.42545109211776e-06, + "loss": 0.6611, + "step": 3550 + }, + { + "epoch": 15.002136752136753, + "grad_norm": 0.08374079316854477, + "learning_rate": 9.420702754036089e-06, + "loss": 1.2316, + "step": 3560 + }, + { + "epoch": 15.002564102564103, + "grad_norm": 0.5503836274147034, + "learning_rate": 9.415954415954418e-06, + "loss": 0.8269, + "step": 3570 + }, + { + "epoch": 15.002991452991454, + "grad_norm": 46.557186126708984, + "learning_rate": 9.411206077872745e-06, + "loss": 1.2204, + "step": 3580 + }, + { + "epoch": 15.003418803418803, + "grad_norm": 63.70065689086914, + "learning_rate": 9.406457739791074e-06, + "loss": 1.834, + "step": 3590 + }, + { + "epoch": 15.003846153846155, + "grad_norm": 35.3852424621582, + "learning_rate": 9.401709401709402e-06, + "loss": 0.6711, + "step": 3600 + }, + { + "epoch": 15.004273504273504, + "grad_norm": 16.211149215698242, + "learning_rate": 9.396961063627731e-06, + "loss": 0.6182, + "step": 3610 + }, + { + "epoch": 15.004700854700856, + "grad_norm": 76.63227081298828, + "learning_rate": 9.39221272554606e-06, + "loss": 0.831, + "step": 3620 + }, + { + "epoch": 15.005128205128205, + "grad_norm": 0.5662261843681335, + "learning_rate": 9.387464387464388e-06, + "loss": 0.6825, + "step": 3630 + }, + { + "epoch": 15.005555555555556, + "grad_norm": 0.5306532382965088, + "learning_rate": 9.382716049382717e-06, + "loss": 1.213, + "step": 3640 + }, + { + "epoch": 15.005982905982906, + "grad_norm": 74.40409088134766, + "learning_rate": 9.377967711301046e-06, + "loss": 0.5874, + "step": 3650 + }, + { + "epoch": 15.006410256410257, + "grad_norm": 0.02636171504855156, + "learning_rate": 9.373219373219375e-06, + "loss": 0.6467, + "step": 3660 + }, + { + "epoch": 15.006837606837607, + "grad_norm": 0.043768566101789474, + "learning_rate": 9.368471035137702e-06, + "loss": 1.444, + "step": 3670 + }, + { + "epoch": 15.007264957264958, + "grad_norm": 1.168853998184204, + "learning_rate": 9.363722697056032e-06, + "loss": 2.6958, + "step": 3680 + }, + { + "epoch": 15.007692307692308, + "grad_norm": 41.314117431640625, + "learning_rate": 9.358974358974359e-06, + "loss": 1.917, + "step": 3690 + }, + { + "epoch": 15.008119658119659, + "grad_norm": 28.175498962402344, + "learning_rate": 9.354226020892688e-06, + "loss": 1.4498, + "step": 3700 + }, + { + "epoch": 15.008547008547009, + "grad_norm": 0.07334843277931213, + "learning_rate": 9.349477682811017e-06, + "loss": 0.3526, + "step": 3710 + }, + { + "epoch": 15.00897435897436, + "grad_norm": 0.06012887507677078, + "learning_rate": 9.344729344729345e-06, + "loss": 1.0099, + "step": 3720 + }, + { + "epoch": 15.00940170940171, + "grad_norm": 0.030554110184311867, + "learning_rate": 9.339981006647674e-06, + "loss": 0.5927, + "step": 3730 + }, + { + "epoch": 15.00982905982906, + "grad_norm": 0.12558279931545258, + "learning_rate": 9.335232668566003e-06, + "loss": 0.1437, + "step": 3740 + }, + { + "epoch": 15.01, + "eval_accuracy": 0.4, + "eval_loss": 2.969764471054077, + "eval_runtime": 31.3741, + "eval_samples_per_second": 0.797, + "eval_steps_per_second": 0.797, + "step": 3744 + }, + { + "epoch": 16.00025641025641, + "grad_norm": 0.020290929824113846, + "learning_rate": 9.330484330484332e-06, + "loss": 1.4002, + "step": 3750 + }, + { + "epoch": 16.00068376068376, + "grad_norm": 0.02340000309050083, + "learning_rate": 9.32573599240266e-06, + "loss": 1.3384, + "step": 3760 + }, + { + "epoch": 16.00111111111111, + "grad_norm": 0.027016054838895798, + "learning_rate": 9.320987654320989e-06, + "loss": 2.2665, + "step": 3770 + }, + { + "epoch": 16.001538461538463, + "grad_norm": 0.11080523580312729, + "learning_rate": 9.316239316239318e-06, + "loss": 0.7879, + "step": 3780 + }, + { + "epoch": 16.00196581196581, + "grad_norm": 123.3178939819336, + "learning_rate": 9.311490978157645e-06, + "loss": 1.1529, + "step": 3790 + }, + { + "epoch": 16.00239316239316, + "grad_norm": 0.09909241646528244, + "learning_rate": 9.306742640075974e-06, + "loss": 2.4706, + "step": 3800 + }, + { + "epoch": 16.002820512820513, + "grad_norm": 0.025457249954342842, + "learning_rate": 9.301994301994302e-06, + "loss": 0.1416, + "step": 3810 + }, + { + "epoch": 16.003247863247864, + "grad_norm": 0.054812464863061905, + "learning_rate": 9.297245963912631e-06, + "loss": 0.1454, + "step": 3820 + }, + { + "epoch": 16.003675213675212, + "grad_norm": 42.573997497558594, + "learning_rate": 9.29249762583096e-06, + "loss": 2.6915, + "step": 3830 + }, + { + "epoch": 16.004102564102563, + "grad_norm": 0.10259363800287247, + "learning_rate": 9.287749287749288e-06, + "loss": 1.2003, + "step": 3840 + }, + { + "epoch": 16.004529914529915, + "grad_norm": 0.0471661239862442, + "learning_rate": 9.283000949667617e-06, + "loss": 1.2392, + "step": 3850 + }, + { + "epoch": 16.004957264957266, + "grad_norm": 0.0715036392211914, + "learning_rate": 9.278252611585946e-06, + "loss": 0.4382, + "step": 3860 + }, + { + "epoch": 16.005384615384614, + "grad_norm": 0.045701488852500916, + "learning_rate": 9.273504273504275e-06, + "loss": 0.3563, + "step": 3870 + }, + { + "epoch": 16.005811965811965, + "grad_norm": 91.8859634399414, + "learning_rate": 9.268755935422603e-06, + "loss": 1.0391, + "step": 3880 + }, + { + "epoch": 16.006239316239316, + "grad_norm": 2.5818240642547607, + "learning_rate": 9.264007597340932e-06, + "loss": 1.1685, + "step": 3890 + }, + { + "epoch": 16.006666666666668, + "grad_norm": 0.031820375472307205, + "learning_rate": 9.25925925925926e-06, + "loss": 1.1833, + "step": 3900 + }, + { + "epoch": 16.007094017094015, + "grad_norm": 0.07577929645776749, + "learning_rate": 9.254510921177588e-06, + "loss": 2.5019, + "step": 3910 + }, + { + "epoch": 16.007521367521367, + "grad_norm": 0.03320784494280815, + "learning_rate": 9.249762583095917e-06, + "loss": 0.9758, + "step": 3920 + }, + { + "epoch": 16.007948717948718, + "grad_norm": 117.20964813232422, + "learning_rate": 9.245014245014245e-06, + "loss": 0.7272, + "step": 3930 + }, + { + "epoch": 16.00837606837607, + "grad_norm": 12.37725830078125, + "learning_rate": 9.240265906932574e-06, + "loss": 0.4982, + "step": 3940 + }, + { + "epoch": 16.008803418803417, + "grad_norm": 0.03360183909535408, + "learning_rate": 9.235517568850903e-06, + "loss": 1.0855, + "step": 3950 + }, + { + "epoch": 16.00923076923077, + "grad_norm": 209.72621154785156, + "learning_rate": 9.230769230769232e-06, + "loss": 1.1257, + "step": 3960 + }, + { + "epoch": 16.00965811965812, + "grad_norm": 45.13322830200195, + "learning_rate": 9.22602089268756e-06, + "loss": 1.0039, + "step": 3970 + }, + { + "epoch": 16.01, + "eval_accuracy": 0.44, + "eval_loss": 1.9414650201797485, + "eval_runtime": 31.717, + "eval_samples_per_second": 0.788, + "eval_steps_per_second": 0.788, + "step": 3978 + }, + { + "epoch": 17.00008547008547, + "grad_norm": 0.20372648537158966, + "learning_rate": 9.221272554605889e-06, + "loss": 0.989, + "step": 3980 + }, + { + "epoch": 17.00051282051282, + "grad_norm": 143.5639190673828, + "learning_rate": 9.216524216524218e-06, + "loss": 0.5736, + "step": 3990 + }, + { + "epoch": 17.000940170940172, + "grad_norm": 0.0925712063908577, + "learning_rate": 9.211775878442545e-06, + "loss": 0.7445, + "step": 4000 + }, + { + "epoch": 17.00136752136752, + "grad_norm": 0.03829040750861168, + "learning_rate": 9.207027540360875e-06, + "loss": 0.5319, + "step": 4010 + }, + { + "epoch": 17.00179487179487, + "grad_norm": 66.21209716796875, + "learning_rate": 9.202279202279202e-06, + "loss": 1.2316, + "step": 4020 + }, + { + "epoch": 17.002222222222223, + "grad_norm": 0.07042970508337021, + "learning_rate": 9.197530864197531e-06, + "loss": 1.7251, + "step": 4030 + }, + { + "epoch": 17.002649572649574, + "grad_norm": 1.4991275072097778, + "learning_rate": 9.19278252611586e-06, + "loss": 0.9513, + "step": 4040 + }, + { + "epoch": 17.00307692307692, + "grad_norm": 0.5810855627059937, + "learning_rate": 9.188034188034188e-06, + "loss": 2.5822, + "step": 4050 + }, + { + "epoch": 17.003504273504273, + "grad_norm": 0.08620700240135193, + "learning_rate": 9.183285849952517e-06, + "loss": 1.3987, + "step": 4060 + }, + { + "epoch": 17.003931623931624, + "grad_norm": 35.556392669677734, + "learning_rate": 9.178537511870846e-06, + "loss": 1.2302, + "step": 4070 + }, + { + "epoch": 17.004358974358976, + "grad_norm": 33.437049865722656, + "learning_rate": 9.173789173789175e-06, + "loss": 1.1155, + "step": 4080 + }, + { + "epoch": 17.004786324786323, + "grad_norm": 0.6656836867332458, + "learning_rate": 9.169040835707503e-06, + "loss": 1.1672, + "step": 4090 + }, + { + "epoch": 17.005213675213675, + "grad_norm": 0.0775466188788414, + "learning_rate": 9.164292497625832e-06, + "loss": 0.9781, + "step": 4100 + }, + { + "epoch": 17.005641025641026, + "grad_norm": 177.47276306152344, + "learning_rate": 9.159544159544161e-06, + "loss": 1.6938, + "step": 4110 + }, + { + "epoch": 17.006068376068377, + "grad_norm": 0.02730586938560009, + "learning_rate": 9.154795821462488e-06, + "loss": 1.2777, + "step": 4120 + }, + { + "epoch": 17.006495726495725, + "grad_norm": 76.6297607421875, + "learning_rate": 9.150047483380818e-06, + "loss": 0.6365, + "step": 4130 + }, + { + "epoch": 17.006923076923076, + "grad_norm": 1.5181001424789429, + "learning_rate": 9.145299145299145e-06, + "loss": 0.5115, + "step": 4140 + }, + { + "epoch": 17.007350427350428, + "grad_norm": 7.319108486175537, + "learning_rate": 9.140550807217474e-06, + "loss": 1.4807, + "step": 4150 + }, + { + "epoch": 17.00777777777778, + "grad_norm": 139.0987548828125, + "learning_rate": 9.135802469135803e-06, + "loss": 1.5217, + "step": 4160 + }, + { + "epoch": 17.008205128205127, + "grad_norm": 16.966917037963867, + "learning_rate": 9.131054131054132e-06, + "loss": 2.0712, + "step": 4170 + }, + { + "epoch": 17.008632478632478, + "grad_norm": 73.7410659790039, + "learning_rate": 9.126305792972462e-06, + "loss": 1.0309, + "step": 4180 + }, + { + "epoch": 17.00905982905983, + "grad_norm": 0.03598838672041893, + "learning_rate": 9.121557454890789e-06, + "loss": 0.3113, + "step": 4190 + }, + { + "epoch": 17.00948717948718, + "grad_norm": 0.01311536505818367, + "learning_rate": 9.116809116809118e-06, + "loss": 1.8983, + "step": 4200 + }, + { + "epoch": 17.00991452991453, + "grad_norm": 0.07452392578125, + "learning_rate": 9.112060778727446e-06, + "loss": 0.0062, + "step": 4210 + }, + { + "epoch": 17.01, + "eval_accuracy": 0.4, + "eval_loss": 3.7040703296661377, + "eval_runtime": 31.9027, + "eval_samples_per_second": 0.784, + "eval_steps_per_second": 0.784, + "step": 4212 + }, + { + "epoch": 18.00034188034188, + "grad_norm": 107.44599151611328, + "learning_rate": 9.107312440645775e-06, + "loss": 1.8731, + "step": 4220 + }, + { + "epoch": 18.00076923076923, + "grad_norm": 0.23524224758148193, + "learning_rate": 9.102564102564104e-06, + "loss": 1.0308, + "step": 4230 + }, + { + "epoch": 18.00119658119658, + "grad_norm": 0.07270976901054382, + "learning_rate": 9.097815764482431e-06, + "loss": 1.2283, + "step": 4240 + }, + { + "epoch": 18.001623931623932, + "grad_norm": 6.627790451049805, + "learning_rate": 9.09306742640076e-06, + "loss": 1.0094, + "step": 4250 + }, + { + "epoch": 18.002051282051283, + "grad_norm": 0.02103256992995739, + "learning_rate": 9.088319088319088e-06, + "loss": 0.5875, + "step": 4260 + }, + { + "epoch": 18.00247863247863, + "grad_norm": 0.6851010322570801, + "learning_rate": 9.083570750237419e-06, + "loss": 1.2334, + "step": 4270 + }, + { + "epoch": 18.002905982905983, + "grad_norm": 82.27792358398438, + "learning_rate": 9.078822412155746e-06, + "loss": 1.9545, + "step": 4280 + }, + { + "epoch": 18.003333333333334, + "grad_norm": 139.23216247558594, + "learning_rate": 9.074074074074075e-06, + "loss": 0.8405, + "step": 4290 + }, + { + "epoch": 18.003760683760685, + "grad_norm": 0.8755741715431213, + "learning_rate": 9.069325735992403e-06, + "loss": 1.1443, + "step": 4300 + }, + { + "epoch": 18.004188034188033, + "grad_norm": 105.50877380371094, + "learning_rate": 9.064577397910732e-06, + "loss": 0.821, + "step": 4310 + }, + { + "epoch": 18.004615384615384, + "grad_norm": 114.59051513671875, + "learning_rate": 9.059829059829061e-06, + "loss": 1.6665, + "step": 4320 + }, + { + "epoch": 18.005042735042736, + "grad_norm": 0.04322560876607895, + "learning_rate": 9.055080721747389e-06, + "loss": 0.9944, + "step": 4330 + }, + { + "epoch": 18.005470085470087, + "grad_norm": 16.47235870361328, + "learning_rate": 9.050332383665718e-06, + "loss": 0.6319, + "step": 4340 + }, + { + "epoch": 18.005897435897435, + "grad_norm": 5.790500640869141, + "learning_rate": 9.045584045584045e-06, + "loss": 1.4753, + "step": 4350 + }, + { + "epoch": 18.006324786324786, + "grad_norm": 0.045558616518974304, + "learning_rate": 9.040835707502374e-06, + "loss": 1.4482, + "step": 4360 + }, + { + "epoch": 18.006752136752137, + "grad_norm": 0.014822369441390038, + "learning_rate": 9.036087369420703e-06, + "loss": 1.1214, + "step": 4370 + }, + { + "epoch": 18.00717948717949, + "grad_norm": 90.13900756835938, + "learning_rate": 9.031339031339033e-06, + "loss": 2.0725, + "step": 4380 + }, + { + "epoch": 18.007606837606836, + "grad_norm": 80.13975524902344, + "learning_rate": 9.026590693257362e-06, + "loss": 1.1801, + "step": 4390 + }, + { + "epoch": 18.008034188034188, + "grad_norm": 0.009048054926097393, + "learning_rate": 9.02184235517569e-06, + "loss": 0.6618, + "step": 4400 + }, + { + "epoch": 18.00846153846154, + "grad_norm": 59.612308502197266, + "learning_rate": 9.017094017094018e-06, + "loss": 1.7319, + "step": 4410 + }, + { + "epoch": 18.00888888888889, + "grad_norm": 0.1411258578300476, + "learning_rate": 9.012345679012346e-06, + "loss": 0.9388, + "step": 4420 + }, + { + "epoch": 18.009316239316238, + "grad_norm": 12.921591758728027, + "learning_rate": 9.007597340930675e-06, + "loss": 0.6901, + "step": 4430 + }, + { + "epoch": 18.00974358974359, + "grad_norm": 25.335466384887695, + "learning_rate": 9.002849002849004e-06, + "loss": 0.6038, + "step": 4440 + }, + { + "epoch": 18.01, + "eval_accuracy": 0.4, + "eval_loss": 3.2141363620758057, + "eval_runtime": 31.3241, + "eval_samples_per_second": 0.798, + "eval_steps_per_second": 0.798, + "step": 4446 + }, + { + "epoch": 19.00017094017094, + "grad_norm": 119.39568328857422, + "learning_rate": 8.998100664767332e-06, + "loss": 1.4807, + "step": 4450 + }, + { + "epoch": 19.00059829059829, + "grad_norm": 86.97232055664062, + "learning_rate": 8.99335232668566e-06, + "loss": 1.6098, + "step": 4460 + }, + { + "epoch": 19.00102564102564, + "grad_norm": 69.82275390625, + "learning_rate": 8.988603988603988e-06, + "loss": 0.7945, + "step": 4470 + }, + { + "epoch": 19.001452991452993, + "grad_norm": 0.024586718529462814, + "learning_rate": 8.983855650522319e-06, + "loss": 0.289, + "step": 4480 + }, + { + "epoch": 19.00188034188034, + "grad_norm": 0.08284483104944229, + "learning_rate": 8.979107312440646e-06, + "loss": 0.7986, + "step": 4490 + }, + { + "epoch": 19.002307692307692, + "grad_norm": 31.52510643005371, + "learning_rate": 8.974358974358976e-06, + "loss": 0.337, + "step": 4500 + }, + { + "epoch": 19.002735042735043, + "grad_norm": 0.04926073178648949, + "learning_rate": 8.969610636277305e-06, + "loss": 1.7346, + "step": 4510 + }, + { + "epoch": 19.003162393162395, + "grad_norm": 0.06321626901626587, + "learning_rate": 8.964862298195632e-06, + "loss": 0.9723, + "step": 4520 + }, + { + "epoch": 19.003589743589743, + "grad_norm": 0.09472198784351349, + "learning_rate": 8.960113960113961e-06, + "loss": 0.0057, + "step": 4530 + }, + { + "epoch": 19.004017094017094, + "grad_norm": 145.72950744628906, + "learning_rate": 8.955365622032289e-06, + "loss": 0.7365, + "step": 4540 + }, + { + "epoch": 19.004444444444445, + "grad_norm": 0.012398917227983475, + "learning_rate": 8.950617283950618e-06, + "loss": 2.1059, + "step": 4550 + }, + { + "epoch": 19.004871794871796, + "grad_norm": 73.18333435058594, + "learning_rate": 8.945868945868947e-06, + "loss": 1.8259, + "step": 4560 + }, + { + "epoch": 19.005299145299144, + "grad_norm": 84.90556335449219, + "learning_rate": 8.941120607787274e-06, + "loss": 0.696, + "step": 4570 + }, + { + "epoch": 19.005726495726496, + "grad_norm": 0.024428799748420715, + "learning_rate": 8.936372269705604e-06, + "loss": 1.0575, + "step": 4580 + }, + { + "epoch": 19.006153846153847, + "grad_norm": 44.86989974975586, + "learning_rate": 8.931623931623933e-06, + "loss": 1.0172, + "step": 4590 + }, + { + "epoch": 19.006581196581198, + "grad_norm": 0.16147296130657196, + "learning_rate": 8.926875593542262e-06, + "loss": 0.4637, + "step": 4600 + }, + { + "epoch": 19.007008547008546, + "grad_norm": 0.016476508229970932, + "learning_rate": 8.92212725546059e-06, + "loss": 0.0585, + "step": 4610 + }, + { + "epoch": 19.007435897435897, + "grad_norm": 69.78681945800781, + "learning_rate": 8.917378917378919e-06, + "loss": 2.8439, + "step": 4620 + }, + { + "epoch": 19.00786324786325, + "grad_norm": 143.7225799560547, + "learning_rate": 8.912630579297246e-06, + "loss": 0.3701, + "step": 4630 + }, + { + "epoch": 19.0082905982906, + "grad_norm": 3.918398141860962, + "learning_rate": 8.907882241215575e-06, + "loss": 0.6645, + "step": 4640 + }, + { + "epoch": 19.008717948717948, + "grad_norm": 0.4194408357143402, + "learning_rate": 8.903133903133904e-06, + "loss": 1.5778, + "step": 4650 + }, + { + "epoch": 19.0091452991453, + "grad_norm": 0.7064778804779053, + "learning_rate": 8.898385565052232e-06, + "loss": 0.9134, + "step": 4660 + }, + { + "epoch": 19.00957264957265, + "grad_norm": 97.42158508300781, + "learning_rate": 8.893637226970561e-06, + "loss": 0.6072, + "step": 4670 + }, + { + "epoch": 19.01, + "grad_norm": 0.028133299201726913, + "learning_rate": 8.888888888888888e-06, + "loss": 1.1687, + "step": 4680 + }, + { + "epoch": 19.01, + "eval_accuracy": 0.44, + "eval_loss": 2.407238721847534, + "eval_runtime": 31.5073, + "eval_samples_per_second": 0.793, + "eval_steps_per_second": 0.793, + "step": 4680 + }, + { + "epoch": 20.00042735042735, + "grad_norm": 0.0467204824090004, + "learning_rate": 8.884140550807219e-06, + "loss": 1.0038, + "step": 4690 + }, + { + "epoch": 20.000854700854703, + "grad_norm": 0.044175442308187485, + "learning_rate": 8.879392212725547e-06, + "loss": 0.2621, + "step": 4700 + }, + { + "epoch": 20.00128205128205, + "grad_norm": 0.12440741062164307, + "learning_rate": 8.874643874643876e-06, + "loss": 1.2026, + "step": 4710 + }, + { + "epoch": 20.0017094017094, + "grad_norm": 0.011806718073785305, + "learning_rate": 8.869895536562205e-06, + "loss": 0.7563, + "step": 4720 + }, + { + "epoch": 20.002136752136753, + "grad_norm": 1.0385252237319946, + "learning_rate": 8.865147198480532e-06, + "loss": 0.5996, + "step": 4730 + }, + { + "epoch": 20.002564102564104, + "grad_norm": 175.9308319091797, + "learning_rate": 8.860398860398861e-06, + "loss": 0.8629, + "step": 4740 + }, + { + "epoch": 20.002991452991452, + "grad_norm": 0.028463028371334076, + "learning_rate": 8.855650522317189e-06, + "loss": 2.8217, + "step": 4750 + }, + { + "epoch": 20.003418803418803, + "grad_norm": 0.030408434569835663, + "learning_rate": 8.850902184235518e-06, + "loss": 0.6615, + "step": 4760 + }, + { + "epoch": 20.003846153846155, + "grad_norm": 0.057413261383771896, + "learning_rate": 8.846153846153847e-06, + "loss": 0.5805, + "step": 4770 + }, + { + "epoch": 20.004273504273506, + "grad_norm": 73.27574920654297, + "learning_rate": 8.841405508072175e-06, + "loss": 1.1753, + "step": 4780 + }, + { + "epoch": 20.004700854700854, + "grad_norm": 0.06163284182548523, + "learning_rate": 8.836657169990504e-06, + "loss": 0.4274, + "step": 4790 + }, + { + "epoch": 20.005128205128205, + "grad_norm": 23.880149841308594, + "learning_rate": 8.831908831908833e-06, + "loss": 0.6001, + "step": 4800 + }, + { + "epoch": 20.005555555555556, + "grad_norm": 81.80984497070312, + "learning_rate": 8.827160493827162e-06, + "loss": 0.7434, + "step": 4810 + }, + { + "epoch": 20.005982905982908, + "grad_norm": 0.018306629732251167, + "learning_rate": 8.82241215574549e-06, + "loss": 0.7265, + "step": 4820 + }, + { + "epoch": 20.006410256410255, + "grad_norm": 2.1246113777160645, + "learning_rate": 8.817663817663819e-06, + "loss": 1.1732, + "step": 4830 + }, + { + "epoch": 20.006837606837607, + "grad_norm": 0.1039893627166748, + "learning_rate": 8.812915479582148e-06, + "loss": 1.2742, + "step": 4840 + }, + { + "epoch": 20.007264957264958, + "grad_norm": 0.3535962402820587, + "learning_rate": 8.808167141500475e-06, + "loss": 0.4528, + "step": 4850 + }, + { + "epoch": 20.00769230769231, + "grad_norm": 177.94131469726562, + "learning_rate": 8.803418803418804e-06, + "loss": 0.4491, + "step": 4860 + }, + { + "epoch": 20.008119658119657, + "grad_norm": 0.01753688044846058, + "learning_rate": 8.798670465337132e-06, + "loss": 1.9356, + "step": 4870 + }, + { + "epoch": 20.00854700854701, + "grad_norm": 5.777415752410889, + "learning_rate": 8.793922127255461e-06, + "loss": 0.7365, + "step": 4880 + }, + { + "epoch": 20.00897435897436, + "grad_norm": 0.021493086591362953, + "learning_rate": 8.78917378917379e-06, + "loss": 1.5335, + "step": 4890 + }, + { + "epoch": 20.00940170940171, + "grad_norm": 56.53275680541992, + "learning_rate": 8.78442545109212e-06, + "loss": 2.5917, + "step": 4900 + }, + { + "epoch": 20.00982905982906, + "grad_norm": 0.8556663393974304, + "learning_rate": 8.779677113010447e-06, + "loss": 0.8397, + "step": 4910 + }, + { + "epoch": 20.01, + "eval_accuracy": 0.4, + "eval_loss": 3.42124080657959, + "eval_runtime": 32.6346, + "eval_samples_per_second": 0.766, + "eval_steps_per_second": 0.766, + "step": 4914 + }, + { + "epoch": 21.00025641025641, + "grad_norm": 0.02766931988298893, + "learning_rate": 8.774928774928776e-06, + "loss": 1.0177, + "step": 4920 + }, + { + "epoch": 21.00068376068376, + "grad_norm": 1.8544671535491943, + "learning_rate": 8.770180436847105e-06, + "loss": 0.6464, + "step": 4930 + }, + { + "epoch": 21.00111111111111, + "grad_norm": 64.42668914794922, + "learning_rate": 8.765432098765432e-06, + "loss": 2.2753, + "step": 4940 + }, + { + "epoch": 21.001538461538463, + "grad_norm": 5.441823959350586, + "learning_rate": 8.760683760683762e-06, + "loss": 0.3229, + "step": 4950 + }, + { + "epoch": 21.00196581196581, + "grad_norm": 0.053660500794649124, + "learning_rate": 8.755935422602089e-06, + "loss": 0.4219, + "step": 4960 + }, + { + "epoch": 21.00239316239316, + "grad_norm": 0.015398544259369373, + "learning_rate": 8.751187084520418e-06, + "loss": 0.0334, + "step": 4970 + }, + { + "epoch": 21.002820512820513, + "grad_norm": 27.52494239807129, + "learning_rate": 8.746438746438747e-06, + "loss": 1.3975, + "step": 4980 + }, + { + "epoch": 21.003247863247864, + "grad_norm": 0.019361304119229317, + "learning_rate": 8.741690408357075e-06, + "loss": 1.1835, + "step": 4990 + }, + { + "epoch": 21.003675213675212, + "grad_norm": 0.15508921444416046, + "learning_rate": 8.736942070275406e-06, + "loss": 0.9132, + "step": 5000 + }, + { + "epoch": 21.004102564102563, + "grad_norm": 394.57867431640625, + "learning_rate": 8.732193732193733e-06, + "loss": 0.8003, + "step": 5010 + }, + { + "epoch": 21.004529914529915, + "grad_norm": 0.3072645962238312, + "learning_rate": 8.727445394112062e-06, + "loss": 1.085, + "step": 5020 + }, + { + "epoch": 21.004957264957266, + "grad_norm": 128.3138427734375, + "learning_rate": 8.72269705603039e-06, + "loss": 1.6623, + "step": 5030 + }, + { + "epoch": 21.005384615384614, + "grad_norm": 0.40651294589042664, + "learning_rate": 8.717948717948719e-06, + "loss": 1.3648, + "step": 5040 + }, + { + "epoch": 21.005811965811965, + "grad_norm": 51.56714630126953, + "learning_rate": 8.713200379867048e-06, + "loss": 1.1869, + "step": 5050 + }, + { + "epoch": 21.006239316239316, + "grad_norm": 167.30929565429688, + "learning_rate": 8.708452041785375e-06, + "loss": 1.2651, + "step": 5060 + }, + { + "epoch": 21.006666666666668, + "grad_norm": 26.427907943725586, + "learning_rate": 8.703703703703705e-06, + "loss": 1.136, + "step": 5070 + }, + { + "epoch": 21.007094017094015, + "grad_norm": 0.03196699172258377, + "learning_rate": 8.698955365622032e-06, + "loss": 0.5952, + "step": 5080 + }, + { + "epoch": 21.007521367521367, + "grad_norm": 131.19776916503906, + "learning_rate": 8.694207027540361e-06, + "loss": 1.0937, + "step": 5090 + }, + { + "epoch": 21.007948717948718, + "grad_norm": 0.009517078287899494, + "learning_rate": 8.68945868945869e-06, + "loss": 0.6601, + "step": 5100 + }, + { + "epoch": 21.00837606837607, + "grad_norm": 106.69688415527344, + "learning_rate": 8.68471035137702e-06, + "loss": 1.0542, + "step": 5110 + }, + { + "epoch": 21.008803418803417, + "grad_norm": 0.006986229680478573, + "learning_rate": 8.679962013295347e-06, + "loss": 1.1394, + "step": 5120 + }, + { + "epoch": 21.00923076923077, + "grad_norm": 127.98587799072266, + "learning_rate": 8.675213675213676e-06, + "loss": 2.0223, + "step": 5130 + }, + { + "epoch": 21.00965811965812, + "grad_norm": 113.3585205078125, + "learning_rate": 8.670465337132005e-06, + "loss": 1.1147, + "step": 5140 + }, + { + "epoch": 21.01, + "eval_accuracy": 0.44, + "eval_loss": 2.511547565460205, + "eval_runtime": 33.474, + "eval_samples_per_second": 0.747, + "eval_steps_per_second": 0.747, + "step": 5148 + }, + { + "epoch": 22.00008547008547, + "grad_norm": 0.017451312392950058, + "learning_rate": 8.665716999050333e-06, + "loss": 0.4376, + "step": 5150 + }, + { + "epoch": 22.00051282051282, + "grad_norm": 0.21703580021858215, + "learning_rate": 8.660968660968662e-06, + "loss": 0.8807, + "step": 5160 + }, + { + "epoch": 22.000940170940172, + "grad_norm": 0.01205768994987011, + "learning_rate": 8.656220322886991e-06, + "loss": 0.4462, + "step": 5170 + }, + { + "epoch": 22.00136752136752, + "grad_norm": 62.46580123901367, + "learning_rate": 8.651471984805318e-06, + "loss": 0.6433, + "step": 5180 + }, + { + "epoch": 22.00179487179487, + "grad_norm": 195.9849853515625, + "learning_rate": 8.646723646723648e-06, + "loss": 0.6842, + "step": 5190 + }, + { + "epoch": 22.002222222222223, + "grad_norm": 23.76949691772461, + "learning_rate": 8.641975308641975e-06, + "loss": 1.8101, + "step": 5200 + }, + { + "epoch": 22.002649572649574, + "grad_norm": 1.1113344430923462, + "learning_rate": 8.637226970560306e-06, + "loss": 1.3208, + "step": 5210 + }, + { + "epoch": 22.00307692307692, + "grad_norm": 50.05499267578125, + "learning_rate": 8.632478632478633e-06, + "loss": 1.7615, + "step": 5220 + }, + { + "epoch": 22.003504273504273, + "grad_norm": 148.08843994140625, + "learning_rate": 8.627730294396962e-06, + "loss": 0.5733, + "step": 5230 + }, + { + "epoch": 22.003931623931624, + "grad_norm": 22.12079429626465, + "learning_rate": 8.62298195631529e-06, + "loss": 0.399, + "step": 5240 + }, + { + "epoch": 22.004358974358976, + "grad_norm": 61.95510482788086, + "learning_rate": 8.618233618233619e-06, + "loss": 0.8791, + "step": 5250 + }, + { + "epoch": 22.004786324786323, + "grad_norm": 0.0063286456279456615, + "learning_rate": 8.613485280151948e-06, + "loss": 0.9967, + "step": 5260 + }, + { + "epoch": 22.005213675213675, + "grad_norm": 0.005704652518033981, + "learning_rate": 8.608736942070276e-06, + "loss": 0.0331, + "step": 5270 + }, + { + "epoch": 22.005641025641026, + "grad_norm": 0.019357306882739067, + "learning_rate": 8.603988603988605e-06, + "loss": 1.0326, + "step": 5280 + }, + { + "epoch": 22.006068376068377, + "grad_norm": 38.796791076660156, + "learning_rate": 8.599240265906932e-06, + "loss": 0.3044, + "step": 5290 + }, + { + "epoch": 22.006495726495725, + "grad_norm": 0.023437276482582092, + "learning_rate": 8.594491927825261e-06, + "loss": 1.0089, + "step": 5300 + }, + { + "epoch": 22.006923076923076, + "grad_norm": 91.38776397705078, + "learning_rate": 8.58974358974359e-06, + "loss": 1.7264, + "step": 5310 + }, + { + "epoch": 22.007350427350428, + "grad_norm": 0.006801048293709755, + "learning_rate": 8.58499525166192e-06, + "loss": 0.2886, + "step": 5320 + }, + { + "epoch": 22.00777777777778, + "grad_norm": 0.005525406915694475, + "learning_rate": 8.580246913580249e-06, + "loss": 0.6458, + "step": 5330 + }, + { + "epoch": 22.008205128205127, + "grad_norm": 396.8755187988281, + "learning_rate": 8.575498575498576e-06, + "loss": 0.6596, + "step": 5340 + }, + { + "epoch": 22.008632478632478, + "grad_norm": 0.45548421144485474, + "learning_rate": 8.570750237416905e-06, + "loss": 0.8506, + "step": 5350 + }, + { + "epoch": 22.00905982905983, + "grad_norm": 0.06096582114696503, + "learning_rate": 8.566001899335233e-06, + "loss": 0.9077, + "step": 5360 + }, + { + "epoch": 22.00948717948718, + "grad_norm": 52.80357360839844, + "learning_rate": 8.561253561253562e-06, + "loss": 1.4106, + "step": 5370 + }, + { + "epoch": 22.00991452991453, + "grad_norm": 0.40176790952682495, + "learning_rate": 8.556505223171891e-06, + "loss": 0.2286, + "step": 5380 + }, + { + "epoch": 22.01, + "eval_accuracy": 0.44, + "eval_loss": 2.434278964996338, + "eval_runtime": 33.4186, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 5382 + }, + { + "epoch": 23.00034188034188, + "grad_norm": 0.008083014748990536, + "learning_rate": 8.551756885090219e-06, + "loss": 0.2302, + "step": 5390 + }, + { + "epoch": 23.00076923076923, + "grad_norm": 81.26300048828125, + "learning_rate": 8.547008547008548e-06, + "loss": 0.586, + "step": 5400 + }, + { + "epoch": 23.00119658119658, + "grad_norm": 0.00498996814712882, + "learning_rate": 8.542260208926875e-06, + "loss": 0.3741, + "step": 5410 + }, + { + "epoch": 23.001623931623932, + "grad_norm": 0.028612615540623665, + "learning_rate": 8.537511870845206e-06, + "loss": 0.4084, + "step": 5420 + }, + { + "epoch": 23.002051282051283, + "grad_norm": 0.005741656292229891, + "learning_rate": 8.532763532763533e-06, + "loss": 1.1549, + "step": 5430 + }, + { + "epoch": 23.00247863247863, + "grad_norm": 0.0038078909274190664, + "learning_rate": 8.528015194681863e-06, + "loss": 0.5803, + "step": 5440 + }, + { + "epoch": 23.002905982905983, + "grad_norm": 0.07777408510446548, + "learning_rate": 8.52326685660019e-06, + "loss": 1.8444, + "step": 5450 + }, + { + "epoch": 23.003333333333334, + "grad_norm": 0.23302805423736572, + "learning_rate": 8.518518518518519e-06, + "loss": 0.3736, + "step": 5460 + }, + { + "epoch": 23.003760683760685, + "grad_norm": 0.03206072375178337, + "learning_rate": 8.513770180436848e-06, + "loss": 0.6338, + "step": 5470 + }, + { + "epoch": 23.004188034188033, + "grad_norm": 55.12649154663086, + "learning_rate": 8.509021842355176e-06, + "loss": 0.9267, + "step": 5480 + }, + { + "epoch": 23.004615384615384, + "grad_norm": 68.18521881103516, + "learning_rate": 8.504273504273505e-06, + "loss": 0.8771, + "step": 5490 + }, + { + "epoch": 23.005042735042736, + "grad_norm": 28.95956039428711, + "learning_rate": 8.499525166191834e-06, + "loss": 0.8377, + "step": 5500 + }, + { + "epoch": 23.005470085470087, + "grad_norm": 0.004457338713109493, + "learning_rate": 8.494776828110161e-06, + "loss": 0.0553, + "step": 5510 + }, + { + "epoch": 23.005897435897435, + "grad_norm": 0.024025579914450645, + "learning_rate": 8.49002849002849e-06, + "loss": 0.8331, + "step": 5520 + }, + { + "epoch": 23.006324786324786, + "grad_norm": 0.019380344077944756, + "learning_rate": 8.48528015194682e-06, + "loss": 1.7575, + "step": 5530 + }, + { + "epoch": 23.006752136752137, + "grad_norm": 0.015158622525632381, + "learning_rate": 8.480531813865149e-06, + "loss": 1.1634, + "step": 5540 + }, + { + "epoch": 23.00717948717949, + "grad_norm": 27.653972625732422, + "learning_rate": 8.475783475783476e-06, + "loss": 1.3936, + "step": 5550 + }, + { + "epoch": 23.007606837606836, + "grad_norm": 197.41244506835938, + "learning_rate": 8.471035137701806e-06, + "loss": 0.9821, + "step": 5560 + }, + { + "epoch": 23.008034188034188, + "grad_norm": 0.009685487486422062, + "learning_rate": 8.466286799620133e-06, + "loss": 0.048, + "step": 5570 + }, + { + "epoch": 23.00846153846154, + "grad_norm": 0.04525621980428696, + "learning_rate": 8.461538461538462e-06, + "loss": 1.1907, + "step": 5580 + }, + { + "epoch": 23.00888888888889, + "grad_norm": 0.29436931014060974, + "learning_rate": 8.456790123456791e-06, + "loss": 2.1125, + "step": 5590 + }, + { + "epoch": 23.009316239316238, + "grad_norm": 0.028307698667049408, + "learning_rate": 8.452041785375119e-06, + "loss": 0.6447, + "step": 5600 + }, + { + "epoch": 23.00974358974359, + "grad_norm": 171.65646362304688, + "learning_rate": 8.447293447293448e-06, + "loss": 0.8939, + "step": 5610 + }, + { + "epoch": 23.01, + "eval_accuracy": 0.4, + "eval_loss": 3.0712430477142334, + "eval_runtime": 33.4249, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 5616 + }, + { + "epoch": 24.00017094017094, + "grad_norm": 0.5712462067604065, + "learning_rate": 8.442545109211775e-06, + "loss": 0.2082, + "step": 5620 + }, + { + "epoch": 24.00059829059829, + "grad_norm": 0.008292453363537788, + "learning_rate": 8.437796771130106e-06, + "loss": 0.6958, + "step": 5630 + }, + { + "epoch": 24.00102564102564, + "grad_norm": 118.00381469726562, + "learning_rate": 8.433048433048434e-06, + "loss": 0.9527, + "step": 5640 + }, + { + "epoch": 24.001452991452993, + "grad_norm": 1.618455171585083, + "learning_rate": 8.428300094966763e-06, + "loss": 0.3995, + "step": 5650 + }, + { + "epoch": 24.00188034188034, + "grad_norm": 131.17752075195312, + "learning_rate": 8.423551756885092e-06, + "loss": 0.3038, + "step": 5660 + }, + { + "epoch": 24.002307692307692, + "grad_norm": 2.7916219234466553, + "learning_rate": 8.41880341880342e-06, + "loss": 0.7865, + "step": 5670 + }, + { + "epoch": 24.002735042735043, + "grad_norm": 37.944374084472656, + "learning_rate": 8.414055080721748e-06, + "loss": 0.1766, + "step": 5680 + }, + { + "epoch": 24.003162393162395, + "grad_norm": 0.038069456815719604, + "learning_rate": 8.409306742640076e-06, + "loss": 0.3233, + "step": 5690 + }, + { + "epoch": 24.003589743589743, + "grad_norm": 0.01322519313544035, + "learning_rate": 8.404558404558405e-06, + "loss": 0.2382, + "step": 5700 + }, + { + "epoch": 24.004017094017094, + "grad_norm": 181.4189910888672, + "learning_rate": 8.399810066476734e-06, + "loss": 0.9649, + "step": 5710 + }, + { + "epoch": 24.004444444444445, + "grad_norm": 37.26016616821289, + "learning_rate": 8.395061728395062e-06, + "loss": 1.6616, + "step": 5720 + }, + { + "epoch": 24.004871794871796, + "grad_norm": 0.007100946735590696, + "learning_rate": 8.39031339031339e-06, + "loss": 1.193, + "step": 5730 + }, + { + "epoch": 24.005299145299144, + "grad_norm": 2.50199031829834, + "learning_rate": 8.38556505223172e-06, + "loss": 1.475, + "step": 5740 + }, + { + "epoch": 24.005726495726496, + "grad_norm": 0.008331292308866978, + "learning_rate": 8.380816714150049e-06, + "loss": 0.2329, + "step": 5750 + }, + { + "epoch": 24.006153846153847, + "grad_norm": 0.04707374423742294, + "learning_rate": 8.376068376068377e-06, + "loss": 0.363, + "step": 5760 + }, + { + "epoch": 24.006581196581198, + "grad_norm": 0.08683554083108902, + "learning_rate": 8.371320037986706e-06, + "loss": 1.9591, + "step": 5770 + }, + { + "epoch": 24.007008547008546, + "grad_norm": 0.020473746582865715, + "learning_rate": 8.366571699905033e-06, + "loss": 1.562, + "step": 5780 + }, + { + "epoch": 24.007435897435897, + "grad_norm": 48.53575134277344, + "learning_rate": 8.361823361823362e-06, + "loss": 0.5132, + "step": 5790 + }, + { + "epoch": 24.00786324786325, + "grad_norm": 0.007784120738506317, + "learning_rate": 8.357075023741691e-06, + "loss": 1.1691, + "step": 5800 + }, + { + "epoch": 24.0082905982906, + "grad_norm": 0.08522366732358932, + "learning_rate": 8.352326685660019e-06, + "loss": 0.0145, + "step": 5810 + }, + { + "epoch": 24.008717948717948, + "grad_norm": 3.588449001312256, + "learning_rate": 8.347578347578348e-06, + "loss": 0.2557, + "step": 5820 + }, + { + "epoch": 24.0091452991453, + "grad_norm": 6.431326389312744, + "learning_rate": 8.342830009496677e-06, + "loss": 0.8734, + "step": 5830 + }, + { + "epoch": 24.00957264957265, + "grad_norm": 10.404790878295898, + "learning_rate": 8.338081671415006e-06, + "loss": 1.3251, + "step": 5840 + }, + { + "epoch": 24.01, + "grad_norm": 0.02091125212609768, + "learning_rate": 8.333333333333334e-06, + "loss": 0.3871, + "step": 5850 + }, + { + "epoch": 24.01, + "eval_accuracy": 0.4, + "eval_loss": 3.2393715381622314, + "eval_runtime": 33.9657, + "eval_samples_per_second": 0.736, + "eval_steps_per_second": 0.736, + "step": 5850 + }, + { + "epoch": 25.00042735042735, + "grad_norm": 134.0677947998047, + "learning_rate": 8.328584995251663e-06, + "loss": 0.6986, + "step": 5860 + }, + { + "epoch": 25.000854700854703, + "grad_norm": 0.03229491412639618, + "learning_rate": 8.323836657169992e-06, + "loss": 0.768, + "step": 5870 + }, + { + "epoch": 25.00128205128205, + "grad_norm": 156.05152893066406, + "learning_rate": 8.31908831908832e-06, + "loss": 1.0463, + "step": 5880 + }, + { + "epoch": 25.0017094017094, + "grad_norm": 0.014203476719558239, + "learning_rate": 8.314339981006649e-06, + "loss": 0.488, + "step": 5890 + }, + { + "epoch": 25.002136752136753, + "grad_norm": 285.82000732421875, + "learning_rate": 8.309591642924976e-06, + "loss": 0.8084, + "step": 5900 + }, + { + "epoch": 25.002564102564104, + "grad_norm": 122.90914154052734, + "learning_rate": 8.304843304843305e-06, + "loss": 0.8786, + "step": 5910 + }, + { + "epoch": 25.002991452991452, + "grad_norm": 8.224214553833008, + "learning_rate": 8.300094966761634e-06, + "loss": 0.3977, + "step": 5920 + }, + { + "epoch": 25.003418803418803, + "grad_norm": 230.34385681152344, + "learning_rate": 8.295346628679962e-06, + "loss": 0.5285, + "step": 5930 + }, + { + "epoch": 25.003846153846155, + "grad_norm": 1.262283205986023, + "learning_rate": 8.290598290598293e-06, + "loss": 0.8955, + "step": 5940 + }, + { + "epoch": 25.004273504273506, + "grad_norm": 71.28157043457031, + "learning_rate": 8.28584995251662e-06, + "loss": 0.53, + "step": 5950 + }, + { + "epoch": 25.004700854700854, + "grad_norm": 0.19136950373649597, + "learning_rate": 8.28110161443495e-06, + "loss": 0.8647, + "step": 5960 + }, + { + "epoch": 25.005128205128205, + "grad_norm": 0.2358364760875702, + "learning_rate": 8.276353276353277e-06, + "loss": 0.0021, + "step": 5970 + }, + { + "epoch": 25.005555555555556, + "grad_norm": 0.007490760181099176, + "learning_rate": 8.271604938271606e-06, + "loss": 0.6602, + "step": 5980 + }, + { + "epoch": 25.005982905982908, + "grad_norm": 0.006508139427751303, + "learning_rate": 8.266856600189935e-06, + "loss": 0.7927, + "step": 5990 + }, + { + "epoch": 25.006410256410255, + "grad_norm": 0.024243731051683426, + "learning_rate": 8.262108262108262e-06, + "loss": 1.5309, + "step": 6000 + }, + { + "epoch": 25.006837606837607, + "grad_norm": 11.39758586883545, + "learning_rate": 8.257359924026592e-06, + "loss": 0.311, + "step": 6010 + }, + { + "epoch": 25.007264957264958, + "grad_norm": 0.23802872002124786, + "learning_rate": 8.252611585944919e-06, + "loss": 0.0891, + "step": 6020 + }, + { + "epoch": 25.00769230769231, + "grad_norm": 14.19528865814209, + "learning_rate": 8.247863247863248e-06, + "loss": 1.7728, + "step": 6030 + }, + { + "epoch": 25.008119658119657, + "grad_norm": 0.0041798874735832214, + "learning_rate": 8.243114909781577e-06, + "loss": 0.0076, + "step": 6040 + }, + { + "epoch": 25.00854700854701, + "grad_norm": 83.91490936279297, + "learning_rate": 8.238366571699906e-06, + "loss": 1.0939, + "step": 6050 + }, + { + "epoch": 25.00897435897436, + "grad_norm": 109.98046875, + "learning_rate": 8.233618233618234e-06, + "loss": 0.7906, + "step": 6060 + }, + { + "epoch": 25.00940170940171, + "grad_norm": 0.0034200188238173723, + "learning_rate": 8.228869895536563e-06, + "loss": 0.3631, + "step": 6070 + }, + { + "epoch": 25.00982905982906, + "grad_norm": 0.009681742638349533, + "learning_rate": 8.224121557454892e-06, + "loss": 0.3649, + "step": 6080 + }, + { + "epoch": 25.01, + "eval_accuracy": 0.44, + "eval_loss": 3.9465994834899902, + "eval_runtime": 33.084, + "eval_samples_per_second": 0.756, + "eval_steps_per_second": 0.756, + "step": 6084 + }, + { + "epoch": 26.00025641025641, + "grad_norm": 0.19076739251613617, + "learning_rate": 8.21937321937322e-06, + "loss": 0.9254, + "step": 6090 + }, + { + "epoch": 26.00068376068376, + "grad_norm": 0.0032322730403393507, + "learning_rate": 8.214624881291549e-06, + "loss": 0.7628, + "step": 6100 + }, + { + "epoch": 26.00111111111111, + "grad_norm": 78.26629638671875, + "learning_rate": 8.209876543209876e-06, + "loss": 1.0775, + "step": 6110 + }, + { + "epoch": 26.001538461538463, + "grad_norm": 41.407955169677734, + "learning_rate": 8.205128205128205e-06, + "loss": 0.6687, + "step": 6120 + }, + { + "epoch": 26.00196581196581, + "grad_norm": 0.018887581303715706, + "learning_rate": 8.200379867046535e-06, + "loss": 0.6125, + "step": 6130 + }, + { + "epoch": 26.00239316239316, + "grad_norm": 0.004367130342870951, + "learning_rate": 8.195631528964862e-06, + "loss": 0.4048, + "step": 6140 + }, + { + "epoch": 26.002820512820513, + "grad_norm": 0.005417748354375362, + "learning_rate": 8.190883190883193e-06, + "loss": 0.1982, + "step": 6150 + }, + { + "epoch": 26.003247863247864, + "grad_norm": 0.028607884421944618, + "learning_rate": 8.18613485280152e-06, + "loss": 1.1038, + "step": 6160 + }, + { + "epoch": 26.003675213675212, + "grad_norm": 167.5290069580078, + "learning_rate": 8.18138651471985e-06, + "loss": 0.2927, + "step": 6170 + }, + { + "epoch": 26.004102564102563, + "grad_norm": 0.5390312671661377, + "learning_rate": 8.176638176638177e-06, + "loss": 1.0214, + "step": 6180 + }, + { + "epoch": 26.004529914529915, + "grad_norm": 0.007749592885375023, + "learning_rate": 8.171889838556506e-06, + "loss": 0.2385, + "step": 6190 + }, + { + "epoch": 26.004957264957266, + "grad_norm": 0.005856471136212349, + "learning_rate": 8.167141500474835e-06, + "loss": 0.2423, + "step": 6200 + }, + { + "epoch": 26.005384615384614, + "grad_norm": 136.56642150878906, + "learning_rate": 8.162393162393163e-06, + "loss": 0.7251, + "step": 6210 + }, + { + "epoch": 26.005811965811965, + "grad_norm": 482.7048034667969, + "learning_rate": 8.157644824311492e-06, + "loss": 0.7191, + "step": 6220 + }, + { + "epoch": 26.006239316239316, + "grad_norm": 0.00340673909522593, + "learning_rate": 8.15289648622982e-06, + "loss": 0.0008, + "step": 6230 + }, + { + "epoch": 26.006666666666668, + "grad_norm": 0.0029049227014184, + "learning_rate": 8.148148148148148e-06, + "loss": 0.7355, + "step": 6240 + }, + { + "epoch": 26.007094017094015, + "grad_norm": 0.15764747560024261, + "learning_rate": 8.143399810066477e-06, + "loss": 0.8012, + "step": 6250 + }, + { + "epoch": 26.007521367521367, + "grad_norm": 0.00497576454654336, + "learning_rate": 8.138651471984807e-06, + "loss": 0.7658, + "step": 6260 + }, + { + "epoch": 26.007948717948718, + "grad_norm": 0.009462869726121426, + "learning_rate": 8.133903133903136e-06, + "loss": 0.001, + "step": 6270 + }, + { + "epoch": 26.00837606837607, + "grad_norm": 0.006881650071591139, + "learning_rate": 8.129154795821463e-06, + "loss": 0.0038, + "step": 6280 + }, + { + "epoch": 26.008803418803417, + "grad_norm": 0.036327850073575974, + "learning_rate": 8.124406457739792e-06, + "loss": 3.0791, + "step": 6290 + }, + { + "epoch": 26.00923076923077, + "grad_norm": 78.0274658203125, + "learning_rate": 8.11965811965812e-06, + "loss": 0.3814, + "step": 6300 + }, + { + "epoch": 26.00965811965812, + "grad_norm": 0.21120551228523254, + "learning_rate": 8.114909781576449e-06, + "loss": 1.2601, + "step": 6310 + }, + { + "epoch": 26.01, + "eval_accuracy": 0.44, + "eval_loss": 2.9586069583892822, + "eval_runtime": 33.5582, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 6318 + }, + { + "epoch": 27.00008547008547, + "grad_norm": 67.51969146728516, + "learning_rate": 8.110161443494778e-06, + "loss": 0.9303, + "step": 6320 + }, + { + "epoch": 27.00051282051282, + "grad_norm": 104.40100860595703, + "learning_rate": 8.105413105413106e-06, + "loss": 0.4351, + "step": 6330 + }, + { + "epoch": 27.000940170940172, + "grad_norm": 0.016888683661818504, + "learning_rate": 8.100664767331435e-06, + "loss": 0.3423, + "step": 6340 + }, + { + "epoch": 27.00136752136752, + "grad_norm": 0.011165400967001915, + "learning_rate": 8.095916429249762e-06, + "loss": 1.4128, + "step": 6350 + }, + { + "epoch": 27.00179487179487, + "grad_norm": 0.027417538687586784, + "learning_rate": 8.091168091168093e-06, + "loss": 0.002, + "step": 6360 + }, + { + "epoch": 27.002222222222223, + "grad_norm": 0.09121936559677124, + "learning_rate": 8.08641975308642e-06, + "loss": 2.5068, + "step": 6370 + }, + { + "epoch": 27.002649572649574, + "grad_norm": 0.007114642299711704, + "learning_rate": 8.08167141500475e-06, + "loss": 0.1004, + "step": 6380 + }, + { + "epoch": 27.00307692307692, + "grad_norm": 0.03234013542532921, + "learning_rate": 8.076923076923077e-06, + "loss": 0.2587, + "step": 6390 + }, + { + "epoch": 27.003504273504273, + "grad_norm": 0.06352563947439194, + "learning_rate": 8.072174738841406e-06, + "loss": 0.2638, + "step": 6400 + }, + { + "epoch": 27.003931623931624, + "grad_norm": 0.03514587879180908, + "learning_rate": 8.067426400759735e-06, + "loss": 0.0102, + "step": 6410 + }, + { + "epoch": 27.004358974358976, + "grad_norm": 0.08466164022684097, + "learning_rate": 8.062678062678063e-06, + "loss": 0.3654, + "step": 6420 + }, + { + "epoch": 27.004786324786323, + "grad_norm": 0.01347273774445057, + "learning_rate": 8.057929724596392e-06, + "loss": 0.0482, + "step": 6430 + }, + { + "epoch": 27.005213675213675, + "grad_norm": 25.30817985534668, + "learning_rate": 8.05318138651472e-06, + "loss": 0.0527, + "step": 6440 + }, + { + "epoch": 27.005641025641026, + "grad_norm": 0.003483937354758382, + "learning_rate": 8.048433048433048e-06, + "loss": 0.5832, + "step": 6450 + }, + { + "epoch": 27.006068376068377, + "grad_norm": 0.07294996827840805, + "learning_rate": 8.043684710351378e-06, + "loss": 0.5054, + "step": 6460 + }, + { + "epoch": 27.006495726495725, + "grad_norm": 0.02032245323061943, + "learning_rate": 8.038936372269707e-06, + "loss": 0.2399, + "step": 6470 + }, + { + "epoch": 27.006923076923076, + "grad_norm": 5.702065944671631, + "learning_rate": 8.034188034188036e-06, + "loss": 1.0618, + "step": 6480 + }, + { + "epoch": 27.007350427350428, + "grad_norm": 0.2200225442647934, + "learning_rate": 8.029439696106363e-06, + "loss": 0.4855, + "step": 6490 + }, + { + "epoch": 27.00777777777778, + "grad_norm": 0.02679477632045746, + "learning_rate": 8.024691358024692e-06, + "loss": 0.9622, + "step": 6500 + }, + { + "epoch": 27.008205128205127, + "grad_norm": 0.00978939514607191, + "learning_rate": 8.01994301994302e-06, + "loss": 0.173, + "step": 6510 + }, + { + "epoch": 27.008632478632478, + "grad_norm": 0.009447705931961536, + "learning_rate": 8.015194681861349e-06, + "loss": 0.0833, + "step": 6520 + }, + { + "epoch": 27.00905982905983, + "grad_norm": 6.86177921295166, + "learning_rate": 8.010446343779678e-06, + "loss": 0.3262, + "step": 6530 + }, + { + "epoch": 27.00948717948718, + "grad_norm": 0.2438230961561203, + "learning_rate": 8.005698005698006e-06, + "loss": 0.1945, + "step": 6540 + }, + { + "epoch": 27.00991452991453, + "grad_norm": 0.0071107167750597, + "learning_rate": 8.000949667616335e-06, + "loss": 0.852, + "step": 6550 + }, + { + "epoch": 27.01, + "eval_accuracy": 0.4, + "eval_loss": 4.646361827850342, + "eval_runtime": 33.5078, + "eval_samples_per_second": 0.746, + "eval_steps_per_second": 0.746, + "step": 6552 + }, + { + "epoch": 28.00034188034188, + "grad_norm": 0.007593700196594, + "learning_rate": 7.996201329534662e-06, + "loss": 0.9613, + "step": 6560 + }, + { + "epoch": 28.00076923076923, + "grad_norm": 198.82052612304688, + "learning_rate": 7.991452991452993e-06, + "loss": 0.6948, + "step": 6570 + }, + { + "epoch": 28.00119658119658, + "grad_norm": 0.006188979372382164, + "learning_rate": 7.98670465337132e-06, + "loss": 1.0506, + "step": 6580 + }, + { + "epoch": 28.001623931623932, + "grad_norm": 0.0029619692359119654, + "learning_rate": 7.98195631528965e-06, + "loss": 0.5341, + "step": 6590 + }, + { + "epoch": 28.002051282051283, + "grad_norm": 0.017630403861403465, + "learning_rate": 7.977207977207979e-06, + "loss": 0.775, + "step": 6600 + }, + { + "epoch": 28.00247863247863, + "grad_norm": 0.15129226446151733, + "learning_rate": 7.972459639126306e-06, + "loss": 0.3747, + "step": 6610 + }, + { + "epoch": 28.002905982905983, + "grad_norm": 0.11231208592653275, + "learning_rate": 7.967711301044635e-06, + "loss": 0.0478, + "step": 6620 + }, + { + "epoch": 28.003333333333334, + "grad_norm": 0.01004121731966734, + "learning_rate": 7.962962962962963e-06, + "loss": 0.5073, + "step": 6630 + }, + { + "epoch": 28.003760683760685, + "grad_norm": 0.0024136013817042112, + "learning_rate": 7.958214624881292e-06, + "loss": 1.5819, + "step": 6640 + }, + { + "epoch": 28.004188034188033, + "grad_norm": 329.79754638671875, + "learning_rate": 7.953466286799621e-06, + "loss": 0.5354, + "step": 6650 + }, + { + "epoch": 28.004615384615384, + "grad_norm": 9.606019973754883, + "learning_rate": 7.948717948717949e-06, + "loss": 0.0239, + "step": 6660 + }, + { + "epoch": 28.005042735042736, + "grad_norm": 161.5113067626953, + "learning_rate": 7.943969610636278e-06, + "loss": 1.0728, + "step": 6670 + }, + { + "epoch": 28.005470085470087, + "grad_norm": 0.014358972199261189, + "learning_rate": 7.939221272554607e-06, + "loss": 0.0171, + "step": 6680 + }, + { + "epoch": 28.005897435897435, + "grad_norm": 96.80726623535156, + "learning_rate": 7.934472934472936e-06, + "loss": 0.5362, + "step": 6690 + }, + { + "epoch": 28.006324786324786, + "grad_norm": 0.0033912707585841417, + "learning_rate": 7.929724596391264e-06, + "loss": 0.6892, + "step": 6700 + }, + { + "epoch": 28.006752136752137, + "grad_norm": 11.932380676269531, + "learning_rate": 7.924976258309593e-06, + "loss": 0.4339, + "step": 6710 + }, + { + "epoch": 28.00717948717949, + "grad_norm": 0.026500051841139793, + "learning_rate": 7.92022792022792e-06, + "loss": 1.4133, + "step": 6720 + }, + { + "epoch": 28.007606837606836, + "grad_norm": 0.003824119921773672, + "learning_rate": 7.91547958214625e-06, + "loss": 0.8, + "step": 6730 + }, + { + "epoch": 28.008034188034188, + "grad_norm": 0.02020156756043434, + "learning_rate": 7.910731244064578e-06, + "loss": 0.9052, + "step": 6740 + }, + { + "epoch": 28.00846153846154, + "grad_norm": 80.97819519042969, + "learning_rate": 7.905982905982906e-06, + "loss": 0.2557, + "step": 6750 + }, + { + "epoch": 28.00888888888889, + "grad_norm": 79.75435638427734, + "learning_rate": 7.901234567901235e-06, + "loss": 0.7378, + "step": 6760 + }, + { + "epoch": 28.009316239316238, + "grad_norm": 0.6139788031578064, + "learning_rate": 7.896486229819562e-06, + "loss": 0.595, + "step": 6770 + }, + { + "epoch": 28.00974358974359, + "grad_norm": 181.7918243408203, + "learning_rate": 7.891737891737893e-06, + "loss": 0.6269, + "step": 6780 + }, + { + "epoch": 28.01, + "eval_accuracy": 0.44, + "eval_loss": 3.1291773319244385, + "eval_runtime": 33.0523, + "eval_samples_per_second": 0.756, + "eval_steps_per_second": 0.756, + "step": 6786 + }, + { + "epoch": 29.00017094017094, + "grad_norm": 0.010640930384397507, + "learning_rate": 7.88698955365622e-06, + "loss": 0.8873, + "step": 6790 + }, + { + "epoch": 29.00059829059829, + "grad_norm": 0.007870799861848354, + "learning_rate": 7.88224121557455e-06, + "loss": 0.013, + "step": 6800 + }, + { + "epoch": 29.00102564102564, + "grad_norm": 428.0439453125, + "learning_rate": 7.877492877492879e-06, + "loss": 0.3937, + "step": 6810 + }, + { + "epoch": 29.001452991452993, + "grad_norm": 0.003777164965867996, + "learning_rate": 7.872744539411206e-06, + "loss": 0.2548, + "step": 6820 + }, + { + "epoch": 29.00188034188034, + "grad_norm": 754.8897094726562, + "learning_rate": 7.867996201329536e-06, + "loss": 0.7434, + "step": 6830 + }, + { + "epoch": 29.002307692307692, + "grad_norm": 0.002904111985117197, + "learning_rate": 7.863247863247863e-06, + "loss": 0.6923, + "step": 6840 + }, + { + "epoch": 29.002735042735043, + "grad_norm": 405.4997253417969, + "learning_rate": 7.858499525166192e-06, + "loss": 1.4677, + "step": 6850 + }, + { + "epoch": 29.003162393162395, + "grad_norm": 0.03453909978270531, + "learning_rate": 7.853751187084521e-06, + "loss": 0.2134, + "step": 6860 + }, + { + "epoch": 29.003589743589743, + "grad_norm": 0.002487962134182453, + "learning_rate": 7.849002849002849e-06, + "loss": 0.0651, + "step": 6870 + }, + { + "epoch": 29.004017094017094, + "grad_norm": 0.0023798488546162844, + "learning_rate": 7.844254510921178e-06, + "loss": 0.8198, + "step": 6880 + }, + { + "epoch": 29.004444444444445, + "grad_norm": 0.0023906242568045855, + "learning_rate": 7.839506172839507e-06, + "loss": 1.3586, + "step": 6890 + }, + { + "epoch": 29.004871794871796, + "grad_norm": 0.0033716100733727217, + "learning_rate": 7.834757834757836e-06, + "loss": 0.0181, + "step": 6900 + }, + { + "epoch": 29.005299145299144, + "grad_norm": 260.02825927734375, + "learning_rate": 7.830009496676164e-06, + "loss": 0.287, + "step": 6910 + }, + { + "epoch": 29.005726495726496, + "grad_norm": 0.00520634651184082, + "learning_rate": 7.825261158594493e-06, + "loss": 0.0378, + "step": 6920 + }, + { + "epoch": 29.006153846153847, + "grad_norm": 0.005019763018935919, + "learning_rate": 7.820512820512822e-06, + "loss": 1.2482, + "step": 6930 + }, + { + "epoch": 29.006581196581198, + "grad_norm": 136.67764282226562, + "learning_rate": 7.81576448243115e-06, + "loss": 1.2099, + "step": 6940 + }, + { + "epoch": 29.007008547008546, + "grad_norm": 3.6764883995056152, + "learning_rate": 7.811016144349479e-06, + "loss": 1.5087, + "step": 6950 + }, + { + "epoch": 29.007435897435897, + "grad_norm": 0.004404416773468256, + "learning_rate": 7.806267806267806e-06, + "loss": 0.5894, + "step": 6960 + }, + { + "epoch": 29.00786324786325, + "grad_norm": 0.008280979469418526, + "learning_rate": 7.801519468186135e-06, + "loss": 0.4367, + "step": 6970 + }, + { + "epoch": 29.0082905982906, + "grad_norm": 0.006669474299997091, + "learning_rate": 7.796771130104464e-06, + "loss": 0.5476, + "step": 6980 + }, + { + "epoch": 29.008717948717948, + "grad_norm": 0.012090092524886131, + "learning_rate": 7.792022792022793e-06, + "loss": 1.0435, + "step": 6990 + }, + { + "epoch": 29.0091452991453, + "grad_norm": 0.0026431807782500982, + "learning_rate": 7.787274453941121e-06, + "loss": 0.0031, + "step": 7000 + }, + { + "epoch": 29.00957264957265, + "grad_norm": 0.012273382395505905, + "learning_rate": 7.78252611585945e-06, + "loss": 0.7421, + "step": 7010 + }, + { + "epoch": 29.01, + "grad_norm": 0.3034244179725647, + "learning_rate": 7.77777777777778e-06, + "loss": 1.0013, + "step": 7020 + }, + { + "epoch": 29.01, + "eval_accuracy": 0.4, + "eval_loss": 4.631889820098877, + "eval_runtime": 33.5346, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 7020 + }, + { + "epoch": 30.00042735042735, + "grad_norm": 1.9434560537338257, + "learning_rate": 7.773029439696107e-06, + "loss": 0.8706, + "step": 7030 + }, + { + "epoch": 30.000854700854703, + "grad_norm": 0.003200115170329809, + "learning_rate": 7.768281101614436e-06, + "loss": 0.5544, + "step": 7040 + }, + { + "epoch": 30.00128205128205, + "grad_norm": 0.005437285173684359, + "learning_rate": 7.763532763532763e-06, + "loss": 1.3336, + "step": 7050 + }, + { + "epoch": 30.0017094017094, + "grad_norm": 0.4712231457233429, + "learning_rate": 7.758784425451092e-06, + "loss": 1.0848, + "step": 7060 + }, + { + "epoch": 30.002136752136753, + "grad_norm": 0.0037900730967521667, + "learning_rate": 7.754036087369421e-06, + "loss": 0.3756, + "step": 7070 + }, + { + "epoch": 30.002564102564104, + "grad_norm": 114.87593841552734, + "learning_rate": 7.749287749287749e-06, + "loss": 0.6611, + "step": 7080 + }, + { + "epoch": 30.002991452991452, + "grad_norm": 0.08566898107528687, + "learning_rate": 7.74453941120608e-06, + "loss": 0.2429, + "step": 7090 + }, + { + "epoch": 30.003418803418803, + "grad_norm": 0.0038280077278614044, + "learning_rate": 7.739791073124407e-06, + "loss": 0.001, + "step": 7100 + }, + { + "epoch": 30.003846153846155, + "grad_norm": 0.008761651813983917, + "learning_rate": 7.735042735042736e-06, + "loss": 0.0407, + "step": 7110 + }, + { + "epoch": 30.004273504273506, + "grad_norm": 0.0022454620338976383, + "learning_rate": 7.730294396961064e-06, + "loss": 0.8858, + "step": 7120 + }, + { + "epoch": 30.004700854700854, + "grad_norm": 0.007320001721382141, + "learning_rate": 7.725546058879393e-06, + "loss": 0.5604, + "step": 7130 + }, + { + "epoch": 30.005128205128205, + "grad_norm": 99.08616638183594, + "learning_rate": 7.720797720797722e-06, + "loss": 0.727, + "step": 7140 + }, + { + "epoch": 30.005555555555556, + "grad_norm": 0.013165280222892761, + "learning_rate": 7.71604938271605e-06, + "loss": 0.0583, + "step": 7150 + }, + { + "epoch": 30.005982905982908, + "grad_norm": 0.0017615576507523656, + "learning_rate": 7.711301044634379e-06, + "loss": 0.435, + "step": 7160 + }, + { + "epoch": 30.006410256410255, + "grad_norm": 0.008409475907683372, + "learning_rate": 7.706552706552706e-06, + "loss": 0.7466, + "step": 7170 + }, + { + "epoch": 30.006837606837607, + "grad_norm": 0.002216171706095338, + "learning_rate": 7.701804368471035e-06, + "loss": 0.6197, + "step": 7180 + }, + { + "epoch": 30.007264957264958, + "grad_norm": 0.015576567500829697, + "learning_rate": 7.697056030389364e-06, + "loss": 0.0003, + "step": 7190 + }, + { + "epoch": 30.00769230769231, + "grad_norm": 0.002197784371674061, + "learning_rate": 7.692307692307694e-06, + "loss": 0.0428, + "step": 7200 + }, + { + "epoch": 30.008119658119657, + "grad_norm": 0.019170358777046204, + "learning_rate": 7.687559354226021e-06, + "loss": 2.7654, + "step": 7210 + }, + { + "epoch": 30.00854700854701, + "grad_norm": 0.0038994126953184605, + "learning_rate": 7.68281101614435e-06, + "loss": 1.1708, + "step": 7220 + }, + { + "epoch": 30.00897435897436, + "grad_norm": 0.17869791388511658, + "learning_rate": 7.67806267806268e-06, + "loss": 0.0286, + "step": 7230 + }, + { + "epoch": 30.00940170940171, + "grad_norm": 0.006502739619463682, + "learning_rate": 7.673314339981007e-06, + "loss": 0.4125, + "step": 7240 + }, + { + "epoch": 30.00982905982906, + "grad_norm": 0.01781412959098816, + "learning_rate": 7.668566001899336e-06, + "loss": 0.02, + "step": 7250 + }, + { + "epoch": 30.01, + "eval_accuracy": 0.4, + "eval_loss": 4.251375198364258, + "eval_runtime": 33.6135, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.744, + "step": 7254 + }, + { + "epoch": 31.00025641025641, + "grad_norm": 0.3418414294719696, + "learning_rate": 7.663817663817665e-06, + "loss": 0.4944, + "step": 7260 + }, + { + "epoch": 31.00068376068376, + "grad_norm": 0.08820147812366486, + "learning_rate": 7.659069325735993e-06, + "loss": 0.6409, + "step": 7270 + }, + { + "epoch": 31.00111111111111, + "grad_norm": 0.004805733449757099, + "learning_rate": 7.654320987654322e-06, + "loss": 0.703, + "step": 7280 + }, + { + "epoch": 31.001538461538463, + "grad_norm": 171.22964477539062, + "learning_rate": 7.649572649572649e-06, + "loss": 0.3897, + "step": 7290 + }, + { + "epoch": 31.00196581196581, + "grad_norm": 0.0735219419002533, + "learning_rate": 7.64482431149098e-06, + "loss": 1.2327, + "step": 7300 + }, + { + "epoch": 31.00239316239316, + "grad_norm": 0.19371773302555084, + "learning_rate": 7.640075973409307e-06, + "loss": 0.0612, + "step": 7310 + }, + { + "epoch": 31.002820512820513, + "grad_norm": 0.011904860846698284, + "learning_rate": 7.635327635327637e-06, + "loss": 0.0011, + "step": 7320 + }, + { + "epoch": 31.003247863247864, + "grad_norm": 53.154327392578125, + "learning_rate": 7.630579297245964e-06, + "loss": 0.032, + "step": 7330 + }, + { + "epoch": 31.003675213675212, + "grad_norm": 0.030124612152576447, + "learning_rate": 7.625830959164293e-06, + "loss": 0.171, + "step": 7340 + }, + { + "epoch": 31.004102564102563, + "grad_norm": 0.005205671768635511, + "learning_rate": 7.6210826210826214e-06, + "loss": 0.5295, + "step": 7350 + }, + { + "epoch": 31.004529914529915, + "grad_norm": 3.4219090938568115, + "learning_rate": 7.61633428300095e-06, + "loss": 0.5748, + "step": 7360 + }, + { + "epoch": 31.004957264957266, + "grad_norm": 0.027771934866905212, + "learning_rate": 7.611585944919279e-06, + "loss": 0.0483, + "step": 7370 + }, + { + "epoch": 31.005384615384614, + "grad_norm": 0.0065915524028241634, + "learning_rate": 7.606837606837607e-06, + "loss": 0.8829, + "step": 7380 + }, + { + "epoch": 31.005811965811965, + "grad_norm": 0.01646728627383709, + "learning_rate": 7.6020892687559355e-06, + "loss": 0.0029, + "step": 7390 + }, + { + "epoch": 31.006239316239316, + "grad_norm": 0.007259257137775421, + "learning_rate": 7.5973409306742655e-06, + "loss": 0.8513, + "step": 7400 + }, + { + "epoch": 31.006666666666668, + "grad_norm": 0.004463976714760065, + "learning_rate": 7.592592592592594e-06, + "loss": 0.4636, + "step": 7410 + }, + { + "epoch": 31.007094017094015, + "grad_norm": 0.0043134731240570545, + "learning_rate": 7.587844254510922e-06, + "loss": 1.0482, + "step": 7420 + }, + { + "epoch": 31.007521367521367, + "grad_norm": 0.006201723124831915, + "learning_rate": 7.58309591642925e-06, + "loss": 0.2689, + "step": 7430 + }, + { + "epoch": 31.007948717948718, + "grad_norm": 17.315887451171875, + "learning_rate": 7.578347578347579e-06, + "loss": 1.319, + "step": 7440 + }, + { + "epoch": 31.00837606837607, + "grad_norm": 0.005171514581888914, + "learning_rate": 7.573599240265908e-06, + "loss": 0.002, + "step": 7450 + }, + { + "epoch": 31.008803418803417, + "grad_norm": 108.37242889404297, + "learning_rate": 7.568850902184236e-06, + "loss": 0.9821, + "step": 7460 + }, + { + "epoch": 31.00923076923077, + "grad_norm": 3.896742820739746, + "learning_rate": 7.564102564102564e-06, + "loss": 1.0268, + "step": 7470 + }, + { + "epoch": 31.00965811965812, + "grad_norm": 0.0028510969132184982, + "learning_rate": 7.559354226020893e-06, + "loss": 0.1333, + "step": 7480 + }, + { + "epoch": 31.01, + "eval_accuracy": 0.4, + "eval_loss": 4.330972671508789, + "eval_runtime": 33.6382, + "eval_samples_per_second": 0.743, + "eval_steps_per_second": 0.743, + "step": 7488 + }, + { + "epoch": 32.00008547008547, + "grad_norm": 0.0023284293711185455, + "learning_rate": 7.554605887939222e-06, + "loss": 0.2603, + "step": 7490 + }, + { + "epoch": 32.00051282051282, + "grad_norm": 0.01587550714612007, + "learning_rate": 7.54985754985755e-06, + "loss": 0.1526, + "step": 7500 + }, + { + "epoch": 32.00094017094017, + "grad_norm": 345.5118408203125, + "learning_rate": 7.545109211775879e-06, + "loss": 0.1876, + "step": 7510 + }, + { + "epoch": 32.00136752136752, + "grad_norm": 0.13611197471618652, + "learning_rate": 7.540360873694208e-06, + "loss": 0.0039, + "step": 7520 + }, + { + "epoch": 32.00179487179487, + "grad_norm": 378.3825988769531, + "learning_rate": 7.535612535612537e-06, + "loss": 0.5279, + "step": 7530 + }, + { + "epoch": 32.00222222222222, + "grad_norm": 0.00310819735750556, + "learning_rate": 7.530864197530865e-06, + "loss": 0.0143, + "step": 7540 + }, + { + "epoch": 32.002649572649574, + "grad_norm": 0.009138006716966629, + "learning_rate": 7.526115859449193e-06, + "loss": 0.0012, + "step": 7550 + }, + { + "epoch": 32.003076923076925, + "grad_norm": 96.70939636230469, + "learning_rate": 7.521367521367522e-06, + "loss": 0.8369, + "step": 7560 + }, + { + "epoch": 32.00350427350428, + "grad_norm": 136.33778381347656, + "learning_rate": 7.516619183285851e-06, + "loss": 0.5351, + "step": 7570 + }, + { + "epoch": 32.00393162393162, + "grad_norm": 0.0020322606433182955, + "learning_rate": 7.511870845204179e-06, + "loss": 1.2159, + "step": 7580 + }, + { + "epoch": 32.00435897435897, + "grad_norm": 0.03725990280508995, + "learning_rate": 7.507122507122507e-06, + "loss": 0.6448, + "step": 7590 + }, + { + "epoch": 32.00478632478632, + "grad_norm": 0.0038324042689055204, + "learning_rate": 7.502374169040836e-06, + "loss": 0.5215, + "step": 7600 + }, + { + "epoch": 32.005213675213675, + "grad_norm": 0.009491757489740849, + "learning_rate": 7.497625830959166e-06, + "loss": 0.8682, + "step": 7610 + }, + { + "epoch": 32.005641025641026, + "grad_norm": 0.005307774059474468, + "learning_rate": 7.492877492877494e-06, + "loss": 0.0158, + "step": 7620 + }, + { + "epoch": 32.00606837606838, + "grad_norm": 0.004275487270206213, + "learning_rate": 7.488129154795822e-06, + "loss": 0.5831, + "step": 7630 + }, + { + "epoch": 32.00649572649573, + "grad_norm": 0.011664590798318386, + "learning_rate": 7.4833808167141505e-06, + "loss": 0.0162, + "step": 7640 + }, + { + "epoch": 32.00692307692308, + "grad_norm": 554.8225708007812, + "learning_rate": 7.47863247863248e-06, + "loss": 0.2277, + "step": 7650 + }, + { + "epoch": 32.007350427350424, + "grad_norm": 0.0021805327851325274, + "learning_rate": 7.473884140550808e-06, + "loss": 2.079, + "step": 7660 + }, + { + "epoch": 32.007777777777775, + "grad_norm": 2.559633255004883, + "learning_rate": 7.469135802469136e-06, + "loss": 0.0108, + "step": 7670 + }, + { + "epoch": 32.00820512820513, + "grad_norm": 15.645346641540527, + "learning_rate": 7.4643874643874645e-06, + "loss": 0.9423, + "step": 7680 + }, + { + "epoch": 32.00863247863248, + "grad_norm": 0.0020319002214819193, + "learning_rate": 7.459639126305793e-06, + "loss": 0.7076, + "step": 7690 + }, + { + "epoch": 32.00905982905983, + "grad_norm": 0.6667153239250183, + "learning_rate": 7.454890788224122e-06, + "loss": 0.5031, + "step": 7700 + }, + { + "epoch": 32.00948717948718, + "grad_norm": 0.0021193181164562702, + "learning_rate": 7.450142450142451e-06, + "loss": 0.3348, + "step": 7710 + }, + { + "epoch": 32.00991452991453, + "grad_norm": 0.005187495611608028, + "learning_rate": 7.445394112060779e-06, + "loss": 0.0005, + "step": 7720 + }, + { + "epoch": 32.01, + "eval_accuracy": 0.4, + "eval_loss": 4.535378932952881, + "eval_runtime": 34.0349, + "eval_samples_per_second": 0.735, + "eval_steps_per_second": 0.735, + "step": 7722 + }, + { + "epoch": 33.00034188034188, + "grad_norm": 0.0027542465832084417, + "learning_rate": 7.4406457739791086e-06, + "loss": 0.267, + "step": 7730 + }, + { + "epoch": 33.00076923076923, + "grad_norm": 0.00943476427346468, + "learning_rate": 7.435897435897437e-06, + "loss": 0.4691, + "step": 7740 + }, + { + "epoch": 33.00119658119658, + "grad_norm": 96.7542724609375, + "learning_rate": 7.431149097815765e-06, + "loss": 0.1307, + "step": 7750 + }, + { + "epoch": 33.00162393162393, + "grad_norm": 0.005492000840604305, + "learning_rate": 7.4264007597340934e-06, + "loss": 0.0305, + "step": 7760 + }, + { + "epoch": 33.00205128205128, + "grad_norm": 12.753386497497559, + "learning_rate": 7.421652421652423e-06, + "loss": 0.5948, + "step": 7770 + }, + { + "epoch": 33.002478632478635, + "grad_norm": 0.0043577225878834724, + "learning_rate": 7.416904083570751e-06, + "loss": 0.6122, + "step": 7780 + }, + { + "epoch": 33.002905982905986, + "grad_norm": 0.00950498878955841, + "learning_rate": 7.412155745489079e-06, + "loss": 0.1552, + "step": 7790 + }, + { + "epoch": 33.00333333333333, + "grad_norm": 146.59999084472656, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.7224, + "step": 7800 + }, + { + "epoch": 33.00376068376068, + "grad_norm": 0.0066252099350094795, + "learning_rate": 7.402659069325736e-06, + "loss": 0.0714, + "step": 7810 + }, + { + "epoch": 33.00418803418803, + "grad_norm": 0.003962312359362841, + "learning_rate": 7.397910731244066e-06, + "loss": 0.1235, + "step": 7820 + }, + { + "epoch": 33.004615384615384, + "grad_norm": 0.004090788774192333, + "learning_rate": 7.393162393162394e-06, + "loss": 0.8119, + "step": 7830 + }, + { + "epoch": 33.005042735042736, + "grad_norm": 0.005719439126551151, + "learning_rate": 7.388414055080722e-06, + "loss": 1.4667, + "step": 7840 + }, + { + "epoch": 33.00547008547009, + "grad_norm": 0.004016694147139788, + "learning_rate": 7.3836657169990515e-06, + "loss": 1.7774, + "step": 7850 + }, + { + "epoch": 33.00589743589744, + "grad_norm": 0.02771218866109848, + "learning_rate": 7.37891737891738e-06, + "loss": 0.1237, + "step": 7860 + }, + { + "epoch": 33.00632478632479, + "grad_norm": 10.215202331542969, + "learning_rate": 7.374169040835708e-06, + "loss": 0.2107, + "step": 7870 + }, + { + "epoch": 33.006752136752134, + "grad_norm": 0.036797523498535156, + "learning_rate": 7.369420702754036e-06, + "loss": 0.3696, + "step": 7880 + }, + { + "epoch": 33.007179487179485, + "grad_norm": 0.0023336284793913364, + "learning_rate": 7.364672364672365e-06, + "loss": 0.0944, + "step": 7890 + }, + { + "epoch": 33.007606837606836, + "grad_norm": 0.002150049665942788, + "learning_rate": 7.359924026590694e-06, + "loss": 0.1512, + "step": 7900 + }, + { + "epoch": 33.00803418803419, + "grad_norm": 0.027237646281719208, + "learning_rate": 7.355175688509022e-06, + "loss": 0.1805, + "step": 7910 + }, + { + "epoch": 33.00846153846154, + "grad_norm": 0.004415407776832581, + "learning_rate": 7.350427350427351e-06, + "loss": 0.6579, + "step": 7920 + }, + { + "epoch": 33.00888888888889, + "grad_norm": 0.009564803913235664, + "learning_rate": 7.34567901234568e-06, + "loss": 0.6692, + "step": 7930 + }, + { + "epoch": 33.00931623931624, + "grad_norm": 0.012014131061732769, + "learning_rate": 7.340930674264009e-06, + "loss": 0.0433, + "step": 7940 + }, + { + "epoch": 33.00974358974359, + "grad_norm": 1.3320039510726929, + "learning_rate": 7.336182336182337e-06, + "loss": 0.004, + "step": 7950 + }, + { + "epoch": 33.01, + "eval_accuracy": 0.4, + "eval_loss": 4.597040176391602, + "eval_runtime": 33.0791, + "eval_samples_per_second": 0.756, + "eval_steps_per_second": 0.756, + "step": 7956 + }, + { + "epoch": 34.00017094017094, + "grad_norm": 0.011348673142492771, + "learning_rate": 7.331433998100665e-06, + "loss": 0.0008, + "step": 7960 + }, + { + "epoch": 34.00059829059829, + "grad_norm": 0.008984715677797794, + "learning_rate": 7.326685660018994e-06, + "loss": 0.026, + "step": 7970 + }, + { + "epoch": 34.00102564102564, + "grad_norm": 0.002917045494541526, + "learning_rate": 7.321937321937323e-06, + "loss": 1.1443, + "step": 7980 + }, + { + "epoch": 34.00145299145299, + "grad_norm": 215.349365234375, + "learning_rate": 7.317188983855651e-06, + "loss": 0.4194, + "step": 7990 + }, + { + "epoch": 34.001880341880344, + "grad_norm": 0.02057277224957943, + "learning_rate": 7.312440645773979e-06, + "loss": 0.3795, + "step": 8000 + }, + { + "epoch": 34.002307692307696, + "grad_norm": 0.009319877251982689, + "learning_rate": 7.307692307692308e-06, + "loss": 1.1649, + "step": 8010 + }, + { + "epoch": 34.00273504273504, + "grad_norm": 0.003309460124000907, + "learning_rate": 7.302943969610636e-06, + "loss": 0.665, + "step": 8020 + }, + { + "epoch": 34.00316239316239, + "grad_norm": 0.0023234267719089985, + "learning_rate": 7.298195631528966e-06, + "loss": 0.1577, + "step": 8030 + }, + { + "epoch": 34.00358974358974, + "grad_norm": 19.653993606567383, + "learning_rate": 7.293447293447294e-06, + "loss": 1.0529, + "step": 8040 + }, + { + "epoch": 34.004017094017094, + "grad_norm": 0.0021447527687996626, + "learning_rate": 7.2886989553656225e-06, + "loss": 0.6675, + "step": 8050 + }, + { + "epoch": 34.004444444444445, + "grad_norm": 0.011748022399842739, + "learning_rate": 7.283950617283952e-06, + "loss": 0.0002, + "step": 8060 + }, + { + "epoch": 34.0048717948718, + "grad_norm": 0.0019833322148770094, + "learning_rate": 7.27920227920228e-06, + "loss": 0.5008, + "step": 8070 + }, + { + "epoch": 34.00529914529915, + "grad_norm": 0.016223294660449028, + "learning_rate": 7.274453941120608e-06, + "loss": 0.0013, + "step": 8080 + }, + { + "epoch": 34.0057264957265, + "grad_norm": 0.001770445262081921, + "learning_rate": 7.2697056030389366e-06, + "loss": 0.761, + "step": 8090 + }, + { + "epoch": 34.00615384615384, + "grad_norm": 0.005002091638743877, + "learning_rate": 7.264957264957266e-06, + "loss": 0.0004, + "step": 8100 + }, + { + "epoch": 34.006581196581195, + "grad_norm": 0.0015637052711099386, + "learning_rate": 7.260208926875594e-06, + "loss": 1.2961, + "step": 8110 + }, + { + "epoch": 34.007008547008546, + "grad_norm": 0.002154473215341568, + "learning_rate": 7.255460588793922e-06, + "loss": 0.0005, + "step": 8120 + }, + { + "epoch": 34.0074358974359, + "grad_norm": 0.004174029920250177, + "learning_rate": 7.2507122507122514e-06, + "loss": 0.2425, + "step": 8130 + }, + { + "epoch": 34.00786324786325, + "grad_norm": 0.0016593949403613806, + "learning_rate": 7.245963912630581e-06, + "loss": 0.7072, + "step": 8140 + }, + { + "epoch": 34.0082905982906, + "grad_norm": 0.003939433954656124, + "learning_rate": 7.241215574548909e-06, + "loss": 0.2379, + "step": 8150 + }, + { + "epoch": 34.00871794871795, + "grad_norm": 0.0038190174382179976, + "learning_rate": 7.236467236467237e-06, + "loss": 0.0005, + "step": 8160 + }, + { + "epoch": 34.0091452991453, + "grad_norm": 0.009816322475671768, + "learning_rate": 7.2317188983855655e-06, + "loss": 0.0006, + "step": 8170 + }, + { + "epoch": 34.00957264957265, + "grad_norm": 0.0018094313563778996, + "learning_rate": 7.226970560303895e-06, + "loss": 0.0108, + "step": 8180 + }, + { + "epoch": 34.01, + "grad_norm": 0.0064643691293895245, + "learning_rate": 7.222222222222223e-06, + "loss": 0.3017, + "step": 8190 + }, + { + "epoch": 34.01, + "eval_accuracy": 0.44, + "eval_loss": 4.587911605834961, + "eval_runtime": 33.8673, + "eval_samples_per_second": 0.738, + "eval_steps_per_second": 0.738, + "step": 8190 + }, + { + "epoch": 35.00042735042735, + "grad_norm": 0.00291146500967443, + "learning_rate": 7.217473884140551e-06, + "loss": 0.002, + "step": 8200 + }, + { + "epoch": 35.0008547008547, + "grad_norm": 0.009560974314808846, + "learning_rate": 7.2127255460588795e-06, + "loss": 0.0003, + "step": 8210 + }, + { + "epoch": 35.001282051282054, + "grad_norm": 66.06437683105469, + "learning_rate": 7.207977207977208e-06, + "loss": 1.0965, + "step": 8220 + }, + { + "epoch": 35.001709401709405, + "grad_norm": 0.0027835587970912457, + "learning_rate": 7.203228869895537e-06, + "loss": 0.0116, + "step": 8230 + }, + { + "epoch": 35.00213675213675, + "grad_norm": 0.022427700459957123, + "learning_rate": 7.198480531813866e-06, + "loss": 0.4445, + "step": 8240 + }, + { + "epoch": 35.0025641025641, + "grad_norm": 6.788797378540039, + "learning_rate": 7.193732193732194e-06, + "loss": 0.7864, + "step": 8250 + }, + { + "epoch": 35.00299145299145, + "grad_norm": 0.002020070794969797, + "learning_rate": 7.1889838556505235e-06, + "loss": 0.7858, + "step": 8260 + }, + { + "epoch": 35.0034188034188, + "grad_norm": 0.08197605609893799, + "learning_rate": 7.184235517568852e-06, + "loss": 0.6455, + "step": 8270 + }, + { + "epoch": 35.003846153846155, + "grad_norm": 0.001466662622988224, + "learning_rate": 7.17948717948718e-06, + "loss": 0.057, + "step": 8280 + }, + { + "epoch": 35.004273504273506, + "grad_norm": 0.014914972707629204, + "learning_rate": 7.174738841405508e-06, + "loss": 0.065, + "step": 8290 + }, + { + "epoch": 35.00470085470086, + "grad_norm": 0.04162408038973808, + "learning_rate": 7.169990503323837e-06, + "loss": 0.0734, + "step": 8300 + }, + { + "epoch": 35.00512820512821, + "grad_norm": 0.33848291635513306, + "learning_rate": 7.165242165242166e-06, + "loss": 0.7488, + "step": 8310 + }, + { + "epoch": 35.00555555555555, + "grad_norm": 0.00879302341490984, + "learning_rate": 7.160493827160494e-06, + "loss": 0.0036, + "step": 8320 + }, + { + "epoch": 35.005982905982904, + "grad_norm": 0.0014278884045779705, + "learning_rate": 7.1557454890788224e-06, + "loss": 0.8021, + "step": 8330 + }, + { + "epoch": 35.006410256410255, + "grad_norm": 0.03050161898136139, + "learning_rate": 7.1509971509971524e-06, + "loss": 0.0002, + "step": 8340 + }, + { + "epoch": 35.00683760683761, + "grad_norm": 0.0032293125987052917, + "learning_rate": 7.146248812915481e-06, + "loss": 0.4796, + "step": 8350 + }, + { + "epoch": 35.00726495726496, + "grad_norm": 0.0025629340671002865, + "learning_rate": 7.141500474833809e-06, + "loss": 0.7034, + "step": 8360 + }, + { + "epoch": 35.00769230769231, + "grad_norm": 0.004474421963095665, + "learning_rate": 7.136752136752137e-06, + "loss": 0.0047, + "step": 8370 + }, + { + "epoch": 35.00811965811966, + "grad_norm": 0.04403112828731537, + "learning_rate": 7.132003798670466e-06, + "loss": 0.9985, + "step": 8380 + }, + { + "epoch": 35.00854700854701, + "grad_norm": 0.002210602629929781, + "learning_rate": 7.127255460588795e-06, + "loss": 0.0011, + "step": 8390 + }, + { + "epoch": 35.008974358974356, + "grad_norm": 413.6798400878906, + "learning_rate": 7.122507122507123e-06, + "loss": 0.378, + "step": 8400 + }, + { + "epoch": 35.00940170940171, + "grad_norm": 0.003414222039282322, + "learning_rate": 7.117758784425451e-06, + "loss": 0.722, + "step": 8410 + }, + { + "epoch": 35.00982905982906, + "grad_norm": 0.0017525185830891132, + "learning_rate": 7.11301044634378e-06, + "loss": 0.2014, + "step": 8420 + }, + { + "epoch": 35.01, + "eval_accuracy": 0.4, + "eval_loss": 4.280939102172852, + "eval_runtime": 33.6967, + "eval_samples_per_second": 0.742, + "eval_steps_per_second": 0.742, + "step": 8424 + }, + { + "epoch": 36.00025641025641, + "grad_norm": 323.9665222167969, + "learning_rate": 7.108262108262109e-06, + "loss": 0.7504, + "step": 8430 + }, + { + "epoch": 36.00068376068376, + "grad_norm": 0.0009274838957935572, + "learning_rate": 7.103513770180438e-06, + "loss": 0.0036, + "step": 8440 + }, + { + "epoch": 36.00111111111111, + "grad_norm": 0.004535932093858719, + "learning_rate": 7.098765432098766e-06, + "loss": 0.0003, + "step": 8450 + }, + { + "epoch": 36.00153846153846, + "grad_norm": 0.007801331579685211, + "learning_rate": 7.0940170940170945e-06, + "loss": 0.0066, + "step": 8460 + }, + { + "epoch": 36.00196581196581, + "grad_norm": 1.9428983926773071, + "learning_rate": 7.089268755935424e-06, + "loss": 0.5418, + "step": 8470 + }, + { + "epoch": 36.00239316239316, + "grad_norm": 0.09623009711503983, + "learning_rate": 7.084520417853752e-06, + "loss": 0.2488, + "step": 8480 + }, + { + "epoch": 36.00282051282051, + "grad_norm": 0.020373547449707985, + "learning_rate": 7.07977207977208e-06, + "loss": 0.0012, + "step": 8490 + }, + { + "epoch": 36.003247863247864, + "grad_norm": 0.002565371571108699, + "learning_rate": 7.0750237416904086e-06, + "loss": 0.0004, + "step": 8500 + }, + { + "epoch": 36.003675213675216, + "grad_norm": 243.3353271484375, + "learning_rate": 7.070275403608738e-06, + "loss": 0.1137, + "step": 8510 + }, + { + "epoch": 36.00410256410257, + "grad_norm": 0.0050058732740581036, + "learning_rate": 7.065527065527066e-06, + "loss": 0.7545, + "step": 8520 + }, + { + "epoch": 36.00452991452991, + "grad_norm": 0.0049826339818537235, + "learning_rate": 7.060778727445394e-06, + "loss": 0.0002, + "step": 8530 + }, + { + "epoch": 36.00495726495726, + "grad_norm": 0.0015328944427892566, + "learning_rate": 7.056030389363723e-06, + "loss": 1.5905, + "step": 8540 + }, + { + "epoch": 36.005384615384614, + "grad_norm": 0.12297794222831726, + "learning_rate": 7.051282051282053e-06, + "loss": 0.0142, + "step": 8550 + }, + { + "epoch": 36.005811965811965, + "grad_norm": 363.6435546875, + "learning_rate": 7.046533713200381e-06, + "loss": 0.4281, + "step": 8560 + }, + { + "epoch": 36.006239316239316, + "grad_norm": 10.307737350463867, + "learning_rate": 7.041785375118709e-06, + "loss": 0.0332, + "step": 8570 + }, + { + "epoch": 36.00666666666667, + "grad_norm": 0.001833245623856783, + "learning_rate": 7.0370370370370375e-06, + "loss": 0.1285, + "step": 8580 + }, + { + "epoch": 36.00709401709402, + "grad_norm": 75.9461441040039, + "learning_rate": 7.032288698955367e-06, + "loss": 0.0096, + "step": 8590 + }, + { + "epoch": 36.00752136752137, + "grad_norm": 0.007748633157461882, + "learning_rate": 7.027540360873695e-06, + "loss": 0.0074, + "step": 8600 + }, + { + "epoch": 36.007948717948715, + "grad_norm": 0.0246294979006052, + "learning_rate": 7.022792022792023e-06, + "loss": 0.0058, + "step": 8610 + }, + { + "epoch": 36.008376068376066, + "grad_norm": 0.07270914316177368, + "learning_rate": 7.0180436847103515e-06, + "loss": 0.0007, + "step": 8620 + }, + { + "epoch": 36.00880341880342, + "grad_norm": 0.0704275518655777, + "learning_rate": 7.01329534662868e-06, + "loss": 0.0986, + "step": 8630 + }, + { + "epoch": 36.00923076923077, + "grad_norm": 0.03107783943414688, + "learning_rate": 7.008547008547009e-06, + "loss": 0.0033, + "step": 8640 + }, + { + "epoch": 36.00965811965812, + "grad_norm": 0.21083621680736542, + "learning_rate": 7.003798670465338e-06, + "loss": 0.1573, + "step": 8650 + }, + { + "epoch": 36.01, + "eval_accuracy": 0.44, + "eval_loss": 4.6822123527526855, + "eval_runtime": 33.5951, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.744, + "step": 8658 + }, + { + "epoch": 37.00008547008547, + "grad_norm": 318.6376953125, + "learning_rate": 6.999050332383666e-06, + "loss": 0.5961, + "step": 8660 + }, + { + "epoch": 37.00051282051282, + "grad_norm": 1.5630220174789429, + "learning_rate": 6.9943019943019955e-06, + "loss": 0.005, + "step": 8670 + }, + { + "epoch": 37.00094017094017, + "grad_norm": 0.00120359449647367, + "learning_rate": 6.989553656220324e-06, + "loss": 0.0008, + "step": 8680 + }, + { + "epoch": 37.00136752136752, + "grad_norm": 0.0014155134558677673, + "learning_rate": 6.984805318138652e-06, + "loss": 0.4491, + "step": 8690 + }, + { + "epoch": 37.00179487179487, + "grad_norm": 0.0017230873927474022, + "learning_rate": 6.9800569800569804e-06, + "loss": 0.0004, + "step": 8700 + }, + { + "epoch": 37.00222222222222, + "grad_norm": 0.07810331135988235, + "learning_rate": 6.975308641975309e-06, + "loss": 0.0007, + "step": 8710 + }, + { + "epoch": 37.002649572649574, + "grad_norm": 0.0017077282536774874, + "learning_rate": 6.970560303893638e-06, + "loss": 0.0988, + "step": 8720 + }, + { + "epoch": 37.003076923076925, + "grad_norm": 0.004637227393686771, + "learning_rate": 6.965811965811966e-06, + "loss": 0.1457, + "step": 8730 + }, + { + "epoch": 37.00350427350428, + "grad_norm": 0.017330659553408623, + "learning_rate": 6.9610636277302945e-06, + "loss": 0.0003, + "step": 8740 + }, + { + "epoch": 37.00393162393162, + "grad_norm": 0.0017195155378431082, + "learning_rate": 6.956315289648623e-06, + "loss": 1.9324, + "step": 8750 + }, + { + "epoch": 37.00435897435897, + "grad_norm": 0.0010282599832862616, + "learning_rate": 6.951566951566953e-06, + "loss": 0.6652, + "step": 8760 + }, + { + "epoch": 37.00478632478632, + "grad_norm": 28.446561813354492, + "learning_rate": 6.946818613485281e-06, + "loss": 0.9279, + "step": 8770 + }, + { + "epoch": 37.005213675213675, + "grad_norm": 0.022886212915182114, + "learning_rate": 6.942070275403609e-06, + "loss": 0.9287, + "step": 8780 + }, + { + "epoch": 37.005641025641026, + "grad_norm": 0.006739874370396137, + "learning_rate": 6.937321937321938e-06, + "loss": 0.0193, + "step": 8790 + }, + { + "epoch": 37.00606837606838, + "grad_norm": 0.0022217826917767525, + "learning_rate": 6.932573599240267e-06, + "loss": 0.3743, + "step": 8800 + }, + { + "epoch": 37.00649572649573, + "grad_norm": 353.1299133300781, + "learning_rate": 6.927825261158595e-06, + "loss": 0.7619, + "step": 8810 + }, + { + "epoch": 37.00692307692308, + "grad_norm": 0.010308923199772835, + "learning_rate": 6.923076923076923e-06, + "loss": 0.5226, + "step": 8820 + }, + { + "epoch": 37.007350427350424, + "grad_norm": 0.386707603931427, + "learning_rate": 6.918328584995252e-06, + "loss": 0.6146, + "step": 8830 + }, + { + "epoch": 37.007777777777775, + "grad_norm": 0.01583043485879898, + "learning_rate": 6.913580246913581e-06, + "loss": 1.2693, + "step": 8840 + }, + { + "epoch": 37.00820512820513, + "grad_norm": 0.41986873745918274, + "learning_rate": 6.908831908831909e-06, + "loss": 0.4624, + "step": 8850 + }, + { + "epoch": 37.00863247863248, + "grad_norm": 0.05952145531773567, + "learning_rate": 6.904083570750238e-06, + "loss": 0.1098, + "step": 8860 + }, + { + "epoch": 37.00905982905983, + "grad_norm": 0.5341818332672119, + "learning_rate": 6.8993352326685666e-06, + "loss": 0.0011, + "step": 8870 + }, + { + "epoch": 37.00948717948718, + "grad_norm": 0.7262808680534363, + "learning_rate": 6.894586894586896e-06, + "loss": 0.1184, + "step": 8880 + }, + { + "epoch": 37.00991452991453, + "grad_norm": 0.006815705914050341, + "learning_rate": 6.889838556505224e-06, + "loss": 0.0041, + "step": 8890 + }, + { + "epoch": 37.01, + "eval_accuracy": 0.4, + "eval_loss": 5.167320728302002, + "eval_runtime": 33.711, + "eval_samples_per_second": 0.742, + "eval_steps_per_second": 0.742, + "step": 8892 + }, + { + "epoch": 38.00034188034188, + "grad_norm": 0.0014624128816649318, + "learning_rate": 6.885090218423552e-06, + "loss": 0.0327, + "step": 8900 + }, + { + "epoch": 38.00076923076923, + "grad_norm": 0.4525696933269501, + "learning_rate": 6.880341880341881e-06, + "loss": 0.0079, + "step": 8910 + }, + { + "epoch": 38.00119658119658, + "grad_norm": 0.0014630717923864722, + "learning_rate": 6.87559354226021e-06, + "loss": 0.0003, + "step": 8920 + }, + { + "epoch": 38.00162393162393, + "grad_norm": 0.009966408833861351, + "learning_rate": 6.870845204178538e-06, + "loss": 0.0023, + "step": 8930 + }, + { + "epoch": 38.00205128205128, + "grad_norm": 0.02271059900522232, + "learning_rate": 6.866096866096866e-06, + "loss": 0.0006, + "step": 8940 + }, + { + "epoch": 38.002478632478635, + "grad_norm": 0.15712451934814453, + "learning_rate": 6.861348528015195e-06, + "loss": 0.0004, + "step": 8950 + }, + { + "epoch": 38.002905982905986, + "grad_norm": 0.011041790246963501, + "learning_rate": 6.856600189933523e-06, + "loss": 1.3034, + "step": 8960 + }, + { + "epoch": 38.00333333333333, + "grad_norm": 0.0015798502136021852, + "learning_rate": 6.851851851851853e-06, + "loss": 0.0009, + "step": 8970 + }, + { + "epoch": 38.00376068376068, + "grad_norm": 0.008027788251638412, + "learning_rate": 6.847103513770181e-06, + "loss": 0.4744, + "step": 8980 + }, + { + "epoch": 38.00418803418803, + "grad_norm": 0.045825887471437454, + "learning_rate": 6.8423551756885095e-06, + "loss": 0.003, + "step": 8990 + }, + { + "epoch": 38.004615384615384, + "grad_norm": 0.0023124797735363245, + "learning_rate": 6.837606837606839e-06, + "loss": 0.0006, + "step": 9000 + }, + { + "epoch": 38.005042735042736, + "grad_norm": 4.002900123596191, + "learning_rate": 6.832858499525167e-06, + "loss": 0.5629, + "step": 9010 + }, + { + "epoch": 38.00547008547009, + "grad_norm": 0.1215251013636589, + "learning_rate": 6.828110161443495e-06, + "loss": 0.0034, + "step": 9020 + }, + { + "epoch": 38.00589743589744, + "grad_norm": 0.007816506549715996, + "learning_rate": 6.8233618233618235e-06, + "loss": 0.3739, + "step": 9030 + }, + { + "epoch": 38.00632478632479, + "grad_norm": 80.12126922607422, + "learning_rate": 6.818613485280152e-06, + "loss": 0.258, + "step": 9040 + }, + { + "epoch": 38.006752136752134, + "grad_norm": 0.0014129136689007282, + "learning_rate": 6.813865147198481e-06, + "loss": 0.0002, + "step": 9050 + }, + { + "epoch": 38.007179487179485, + "grad_norm": 507.7630310058594, + "learning_rate": 6.809116809116809e-06, + "loss": 0.2478, + "step": 9060 + }, + { + "epoch": 38.007606837606836, + "grad_norm": 0.0028367233462631702, + "learning_rate": 6.804368471035138e-06, + "loss": 0.5826, + "step": 9070 + }, + { + "epoch": 38.00803418803419, + "grad_norm": 0.018132785335183144, + "learning_rate": 6.7996201329534676e-06, + "loss": 0.3636, + "step": 9080 + }, + { + "epoch": 38.00846153846154, + "grad_norm": 1.1534035205841064, + "learning_rate": 6.794871794871796e-06, + "loss": 0.454, + "step": 9090 + }, + { + "epoch": 38.00888888888889, + "grad_norm": 0.0010121689410880208, + "learning_rate": 6.790123456790124e-06, + "loss": 0.8389, + "step": 9100 + }, + { + "epoch": 38.00931623931624, + "grad_norm": 0.0008693256531842053, + "learning_rate": 6.7853751187084525e-06, + "loss": 0.0002, + "step": 9110 + }, + { + "epoch": 38.00974358974359, + "grad_norm": 0.001992259407415986, + "learning_rate": 6.780626780626781e-06, + "loss": 0.0001, + "step": 9120 + }, + { + "epoch": 38.01, + "eval_accuracy": 0.4, + "eval_loss": 5.400512218475342, + "eval_runtime": 33.5959, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.744, + "step": 9126 + }, + { + "epoch": 39.00017094017094, + "grad_norm": 0.010901215486228466, + "learning_rate": 6.77587844254511e-06, + "loss": 0.0009, + "step": 9130 + }, + { + "epoch": 39.00059829059829, + "grad_norm": 0.016072046011686325, + "learning_rate": 6.771130104463438e-06, + "loss": 0.1631, + "step": 9140 + }, + { + "epoch": 39.00102564102564, + "grad_norm": 0.017606910318136215, + "learning_rate": 6.7663817663817665e-06, + "loss": 0.3611, + "step": 9150 + }, + { + "epoch": 39.00145299145299, + "grad_norm": 0.07369554787874222, + "learning_rate": 6.761633428300095e-06, + "loss": 0.0008, + "step": 9160 + }, + { + "epoch": 39.001880341880344, + "grad_norm": 0.023543603718280792, + "learning_rate": 6.756885090218425e-06, + "loss": 0.121, + "step": 9170 + }, + { + "epoch": 39.002307692307696, + "grad_norm": 37.938270568847656, + "learning_rate": 6.752136752136753e-06, + "loss": 1.2801, + "step": 9180 + }, + { + "epoch": 39.00273504273504, + "grad_norm": 0.0017556172097101808, + "learning_rate": 6.747388414055081e-06, + "loss": 0.0766, + "step": 9190 + }, + { + "epoch": 39.00316239316239, + "grad_norm": 0.017493102699518204, + "learning_rate": 6.74264007597341e-06, + "loss": 0.0422, + "step": 9200 + }, + { + "epoch": 39.00358974358974, + "grad_norm": 0.0010195322101935744, + "learning_rate": 6.737891737891739e-06, + "loss": 0.0576, + "step": 9210 + }, + { + "epoch": 39.004017094017094, + "grad_norm": 0.009548988193273544, + "learning_rate": 6.733143399810067e-06, + "loss": 0.0003, + "step": 9220 + }, + { + "epoch": 39.004444444444445, + "grad_norm": 0.00347512518055737, + "learning_rate": 6.728395061728395e-06, + "loss": 0.0004, + "step": 9230 + }, + { + "epoch": 39.0048717948718, + "grad_norm": 0.027547018602490425, + "learning_rate": 6.723646723646724e-06, + "loss": 0.0002, + "step": 9240 + }, + { + "epoch": 39.00529914529915, + "grad_norm": 0.005075276829302311, + "learning_rate": 6.718898385565053e-06, + "loss": 0.2796, + "step": 9250 + }, + { + "epoch": 39.0057264957265, + "grad_norm": 0.03868807852268219, + "learning_rate": 6.714150047483381e-06, + "loss": 0.005, + "step": 9260 + }, + { + "epoch": 39.00615384615384, + "grad_norm": 324.1128234863281, + "learning_rate": 6.7094017094017094e-06, + "loss": 0.82, + "step": 9270 + }, + { + "epoch": 39.006581196581195, + "grad_norm": 0.10437753051519394, + "learning_rate": 6.7046533713200394e-06, + "loss": 0.9481, + "step": 9280 + }, + { + "epoch": 39.007008547008546, + "grad_norm": 0.12134591490030289, + "learning_rate": 6.699905033238368e-06, + "loss": 0.0065, + "step": 9290 + }, + { + "epoch": 39.0074358974359, + "grad_norm": 0.029539357870817184, + "learning_rate": 6.695156695156696e-06, + "loss": 0.0004, + "step": 9300 + }, + { + "epoch": 39.00786324786325, + "grad_norm": 205.1163787841797, + "learning_rate": 6.690408357075024e-06, + "loss": 0.4614, + "step": 9310 + }, + { + "epoch": 39.0082905982906, + "grad_norm": 0.008227716200053692, + "learning_rate": 6.685660018993353e-06, + "loss": 0.0715, + "step": 9320 + }, + { + "epoch": 39.00871794871795, + "grad_norm": 292.14813232421875, + "learning_rate": 6.680911680911682e-06, + "loss": 0.2026, + "step": 9330 + }, + { + "epoch": 39.0091452991453, + "grad_norm": 0.0007633490022271872, + "learning_rate": 6.67616334283001e-06, + "loss": 0.1259, + "step": 9340 + }, + { + "epoch": 39.00957264957265, + "grad_norm": 0.0007212384953163564, + "learning_rate": 6.671415004748338e-06, + "loss": 0.05, + "step": 9350 + }, + { + "epoch": 39.01, + "grad_norm": 0.0017658963333815336, + "learning_rate": 6.666666666666667e-06, + "loss": 0.1066, + "step": 9360 + }, + { + "epoch": 39.01, + "eval_accuracy": 0.48, + "eval_loss": 4.4508891105651855, + "eval_runtime": 33.5596, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 9360 + }, + { + "epoch": 40.00042735042735, + "grad_norm": 0.24033397436141968, + "learning_rate": 6.661918328584995e-06, + "loss": 0.3139, + "step": 9370 + }, + { + "epoch": 40.0008547008547, + "grad_norm": 0.11893019825220108, + "learning_rate": 6.657169990503325e-06, + "loss": 0.07, + "step": 9380 + }, + { + "epoch": 40.001282051282054, + "grad_norm": 262.5067443847656, + "learning_rate": 6.652421652421653e-06, + "loss": 0.4037, + "step": 9390 + }, + { + "epoch": 40.001709401709405, + "grad_norm": 0.05252963304519653, + "learning_rate": 6.6476733143399815e-06, + "loss": 0.3586, + "step": 9400 + }, + { + "epoch": 40.00213675213675, + "grad_norm": 255.7079620361328, + "learning_rate": 6.642924976258311e-06, + "loss": 0.0414, + "step": 9410 + }, + { + "epoch": 40.0025641025641, + "grad_norm": 0.07694090157747269, + "learning_rate": 6.638176638176639e-06, + "loss": 0.3065, + "step": 9420 + }, + { + "epoch": 40.00299145299145, + "grad_norm": 0.0017998847179114819, + "learning_rate": 6.633428300094967e-06, + "loss": 0.0004, + "step": 9430 + }, + { + "epoch": 40.0034188034188, + "grad_norm": 208.48390197753906, + "learning_rate": 6.6286799620132956e-06, + "loss": 0.775, + "step": 9440 + }, + { + "epoch": 40.003846153846155, + "grad_norm": 0.004139886237680912, + "learning_rate": 6.623931623931624e-06, + "loss": 0.0009, + "step": 9450 + }, + { + "epoch": 40.004273504273506, + "grad_norm": 0.0015383479185402393, + "learning_rate": 6.619183285849953e-06, + "loss": 0.0003, + "step": 9460 + }, + { + "epoch": 40.00470085470086, + "grad_norm": 79.24043273925781, + "learning_rate": 6.614434947768281e-06, + "loss": 0.0126, + "step": 9470 + }, + { + "epoch": 40.00512820512821, + "grad_norm": 0.10130419582128525, + "learning_rate": 6.60968660968661e-06, + "loss": 0.0092, + "step": 9480 + }, + { + "epoch": 40.00555555555555, + "grad_norm": 0.0018481501610949636, + "learning_rate": 6.60493827160494e-06, + "loss": 0.0001, + "step": 9490 + }, + { + "epoch": 40.005982905982904, + "grad_norm": 0.00359438662417233, + "learning_rate": 6.600189933523268e-06, + "loss": 0.0015, + "step": 9500 + }, + { + "epoch": 40.006410256410255, + "grad_norm": 0.0007249795598909259, + "learning_rate": 6.595441595441596e-06, + "loss": 0.0001, + "step": 9510 + }, + { + "epoch": 40.00683760683761, + "grad_norm": 0.0008260689792223275, + "learning_rate": 6.5906932573599245e-06, + "loss": 0.5924, + "step": 9520 + }, + { + "epoch": 40.00726495726496, + "grad_norm": 0.0014530919725075364, + "learning_rate": 6.585944919278253e-06, + "loss": 0.7378, + "step": 9530 + }, + { + "epoch": 40.00769230769231, + "grad_norm": 14.047000885009766, + "learning_rate": 6.581196581196582e-06, + "loss": 0.617, + "step": 9540 + }, + { + "epoch": 40.00811965811966, + "grad_norm": 0.00833844393491745, + "learning_rate": 6.57644824311491e-06, + "loss": 0.0337, + "step": 9550 + }, + { + "epoch": 40.00854700854701, + "grad_norm": 0.001020289957523346, + "learning_rate": 6.5716999050332385e-06, + "loss": 0.5866, + "step": 9560 + }, + { + "epoch": 40.008974358974356, + "grad_norm": 0.0010084334062412381, + "learning_rate": 6.566951566951567e-06, + "loss": 0.765, + "step": 9570 + }, + { + "epoch": 40.00940170940171, + "grad_norm": 0.0008349449490197003, + "learning_rate": 6.562203228869896e-06, + "loss": 0.0001, + "step": 9580 + }, + { + "epoch": 40.00982905982906, + "grad_norm": 0.003181006060913205, + "learning_rate": 6.557454890788225e-06, + "loss": 0.0001, + "step": 9590 + }, + { + "epoch": 40.01, + "eval_accuracy": 0.44, + "eval_loss": 5.09063196182251, + "eval_runtime": 33.6154, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.744, + "step": 9594 + }, + { + "epoch": 41.00025641025641, + "grad_norm": 0.027484165504574776, + "learning_rate": 6.552706552706553e-06, + "loss": 0.0001, + "step": 9600 + }, + { + "epoch": 41.00068376068376, + "grad_norm": 0.002781185321509838, + "learning_rate": 6.5479582146248825e-06, + "loss": 0.9085, + "step": 9610 + }, + { + "epoch": 41.00111111111111, + "grad_norm": 0.002277638763189316, + "learning_rate": 6.543209876543211e-06, + "loss": 0.0046, + "step": 9620 + }, + { + "epoch": 41.00153846153846, + "grad_norm": 0.001006715465337038, + "learning_rate": 6.538461538461539e-06, + "loss": 0.2547, + "step": 9630 + }, + { + "epoch": 41.00196581196581, + "grad_norm": 0.017884666100144386, + "learning_rate": 6.533713200379867e-06, + "loss": 0.0003, + "step": 9640 + }, + { + "epoch": 41.00239316239316, + "grad_norm": 0.0043096174485981464, + "learning_rate": 6.528964862298196e-06, + "loss": 0.3413, + "step": 9650 + }, + { + "epoch": 41.00282051282051, + "grad_norm": 0.008827520534396172, + "learning_rate": 6.524216524216525e-06, + "loss": 0.5963, + "step": 9660 + }, + { + "epoch": 41.003247863247864, + "grad_norm": 380.2333679199219, + "learning_rate": 6.519468186134853e-06, + "loss": 0.1933, + "step": 9670 + }, + { + "epoch": 41.003675213675216, + "grad_norm": 53.198787689208984, + "learning_rate": 6.5147198480531815e-06, + "loss": 0.3678, + "step": 9680 + }, + { + "epoch": 41.00410256410257, + "grad_norm": 0.0006724594277329743, + "learning_rate": 6.50997150997151e-06, + "loss": 0.0017, + "step": 9690 + }, + { + "epoch": 41.00452991452991, + "grad_norm": 0.0016128338174894452, + "learning_rate": 6.50522317188984e-06, + "loss": 0.7708, + "step": 9700 + }, + { + "epoch": 41.00495726495726, + "grad_norm": 0.0053380681201815605, + "learning_rate": 6.500474833808168e-06, + "loss": 0.0048, + "step": 9710 + }, + { + "epoch": 41.005384615384614, + "grad_norm": 11.402435302734375, + "learning_rate": 6.495726495726496e-06, + "loss": 0.1091, + "step": 9720 + }, + { + "epoch": 41.005811965811965, + "grad_norm": 0.0011073511559516191, + "learning_rate": 6.490978157644825e-06, + "loss": 0.0594, + "step": 9730 + }, + { + "epoch": 41.006239316239316, + "grad_norm": 0.0016850410029292107, + "learning_rate": 6.486229819563154e-06, + "loss": 0.5871, + "step": 9740 + }, + { + "epoch": 41.00666666666667, + "grad_norm": 0.011738145723938942, + "learning_rate": 6.481481481481482e-06, + "loss": 0.0002, + "step": 9750 + }, + { + "epoch": 41.00709401709402, + "grad_norm": 0.02577214501798153, + "learning_rate": 6.47673314339981e-06, + "loss": 0.7295, + "step": 9760 + }, + { + "epoch": 41.00752136752137, + "grad_norm": 0.0008236413705162704, + "learning_rate": 6.471984805318139e-06, + "loss": 0.0001, + "step": 9770 + }, + { + "epoch": 41.007948717948715, + "grad_norm": 0.0007155478815548122, + "learning_rate": 6.467236467236467e-06, + "loss": 0.2632, + "step": 9780 + }, + { + "epoch": 41.008376068376066, + "grad_norm": 0.0013219125103205442, + "learning_rate": 6.462488129154796e-06, + "loss": 0.0001, + "step": 9790 + }, + { + "epoch": 41.00880341880342, + "grad_norm": 203.72512817382812, + "learning_rate": 6.457739791073125e-06, + "loss": 0.8773, + "step": 9800 + }, + { + "epoch": 41.00923076923077, + "grad_norm": 0.0009469674550928175, + "learning_rate": 6.4529914529914535e-06, + "loss": 0.0001, + "step": 9810 + }, + { + "epoch": 41.00965811965812, + "grad_norm": 0.013066309504210949, + "learning_rate": 6.448243114909783e-06, + "loss": 1.3235, + "step": 9820 + }, + { + "epoch": 41.01, + "eval_accuracy": 0.48, + "eval_loss": 4.409305572509766, + "eval_runtime": 33.7085, + "eval_samples_per_second": 0.742, + "eval_steps_per_second": 0.742, + "step": 9828 + }, + { + "epoch": 42.00008547008547, + "grad_norm": 0.0041074506007134914, + "learning_rate": 6.443494776828111e-06, + "loss": 0.0003, + "step": 9830 + }, + { + "epoch": 42.00051282051282, + "grad_norm": 0.0007542824023403227, + "learning_rate": 6.438746438746439e-06, + "loss": 0.1122, + "step": 9840 + }, + { + "epoch": 42.00094017094017, + "grad_norm": 0.0015057259006425738, + "learning_rate": 6.433998100664768e-06, + "loss": 0.0002, + "step": 9850 + }, + { + "epoch": 42.00136752136752, + "grad_norm": 0.003451643278822303, + "learning_rate": 6.429249762583096e-06, + "loss": 0.0002, + "step": 9860 + }, + { + "epoch": 42.00179487179487, + "grad_norm": 0.0011182624148204923, + "learning_rate": 6.424501424501425e-06, + "loss": 0.0045, + "step": 9870 + }, + { + "epoch": 42.00222222222222, + "grad_norm": 0.0010131365852430463, + "learning_rate": 6.419753086419753e-06, + "loss": 0.0001, + "step": 9880 + }, + { + "epoch": 42.002649572649574, + "grad_norm": 0.0007555413176305592, + "learning_rate": 6.415004748338082e-06, + "loss": 0.0006, + "step": 9890 + }, + { + "epoch": 42.003076923076925, + "grad_norm": 0.0009690074366517365, + "learning_rate": 6.410256410256412e-06, + "loss": 0.0002, + "step": 9900 + }, + { + "epoch": 42.00350427350428, + "grad_norm": 0.0007182178669609129, + "learning_rate": 6.40550807217474e-06, + "loss": 0.1885, + "step": 9910 + }, + { + "epoch": 42.00393162393162, + "grad_norm": 0.0008008884033188224, + "learning_rate": 6.400759734093068e-06, + "loss": 0.0, + "step": 9920 + }, + { + "epoch": 42.00435897435897, + "grad_norm": 0.00729968398809433, + "learning_rate": 6.3960113960113965e-06, + "loss": 0.9073, + "step": 9930 + }, + { + "epoch": 42.00478632478632, + "grad_norm": 0.0006498902221210301, + "learning_rate": 6.391263057929726e-06, + "loss": 0.0001, + "step": 9940 + }, + { + "epoch": 42.005213675213675, + "grad_norm": 0.00066241534659639, + "learning_rate": 6.386514719848054e-06, + "loss": 1.0737, + "step": 9950 + }, + { + "epoch": 42.005641025641026, + "grad_norm": 330.1017150878906, + "learning_rate": 6.381766381766382e-06, + "loss": 1.9267, + "step": 9960 + }, + { + "epoch": 42.00606837606838, + "grad_norm": 0.01651351898908615, + "learning_rate": 6.3770180436847105e-06, + "loss": 0.0003, + "step": 9970 + }, + { + "epoch": 42.00649572649573, + "grad_norm": 0.005164479836821556, + "learning_rate": 6.372269705603039e-06, + "loss": 0.0008, + "step": 9980 + }, + { + "epoch": 42.00692307692308, + "grad_norm": 425.1309814453125, + "learning_rate": 6.367521367521368e-06, + "loss": 0.3781, + "step": 9990 + }, + { + "epoch": 42.007350427350424, + "grad_norm": 0.014773045666515827, + "learning_rate": 6.362773029439696e-06, + "loss": 0.0002, + "step": 10000 + }, + { + "epoch": 42.007777777777775, + "grad_norm": 0.008133858442306519, + "learning_rate": 6.358024691358025e-06, + "loss": 0.0166, + "step": 10010 + }, + { + "epoch": 42.00820512820513, + "grad_norm": 0.4493364691734314, + "learning_rate": 6.3532763532763546e-06, + "loss": 1.622, + "step": 10020 + }, + { + "epoch": 42.00863247863248, + "grad_norm": 0.007051916792988777, + "learning_rate": 6.348528015194683e-06, + "loss": 0.0094, + "step": 10030 + }, + { + "epoch": 42.00905982905983, + "grad_norm": 0.003875425783917308, + "learning_rate": 6.343779677113011e-06, + "loss": 0.0892, + "step": 10040 + }, + { + "epoch": 42.00948717948718, + "grad_norm": 0.0009031430818140507, + "learning_rate": 6.3390313390313394e-06, + "loss": 0.0001, + "step": 10050 + }, + { + "epoch": 42.00991452991453, + "grad_norm": 392.3472595214844, + "learning_rate": 6.334283000949668e-06, + "loss": 0.4313, + "step": 10060 + }, + { + "epoch": 42.01, + "eval_accuracy": 0.48, + "eval_loss": 4.089754581451416, + "eval_runtime": 33.5482, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 10062 + }, + { + "epoch": 43.00034188034188, + "grad_norm": 0.004300788510590792, + "learning_rate": 6.329534662867997e-06, + "loss": 0.0001, + "step": 10070 + }, + { + "epoch": 43.00076923076923, + "grad_norm": 0.0026491975877434015, + "learning_rate": 6.324786324786325e-06, + "loss": 0.004, + "step": 10080 + }, + { + "epoch": 43.00119658119658, + "grad_norm": 0.002639173995703459, + "learning_rate": 6.3200379867046535e-06, + "loss": 0.0006, + "step": 10090 + }, + { + "epoch": 43.00162393162393, + "grad_norm": 0.0024889677297323942, + "learning_rate": 6.315289648622982e-06, + "loss": 0.0006, + "step": 10100 + }, + { + "epoch": 43.00205128205128, + "grad_norm": 0.0487215556204319, + "learning_rate": 6.310541310541312e-06, + "loss": 0.0001, + "step": 10110 + }, + { + "epoch": 43.002478632478635, + "grad_norm": 0.0023086927831172943, + "learning_rate": 6.30579297245964e-06, + "loss": 0.307, + "step": 10120 + }, + { + "epoch": 43.002905982905986, + "grad_norm": 0.007922650314867496, + "learning_rate": 6.301044634377968e-06, + "loss": 0.3353, + "step": 10130 + }, + { + "epoch": 43.00333333333333, + "grad_norm": 0.0018696904880926013, + "learning_rate": 6.296296296296297e-06, + "loss": 0.0131, + "step": 10140 + }, + { + "epoch": 43.00376068376068, + "grad_norm": 0.027236852794885635, + "learning_rate": 6.291547958214626e-06, + "loss": 0.017, + "step": 10150 + }, + { + "epoch": 43.00418803418803, + "grad_norm": 0.0007559856749139726, + "learning_rate": 6.286799620132954e-06, + "loss": 0.0005, + "step": 10160 + }, + { + "epoch": 43.004615384615384, + "grad_norm": 11.704588890075684, + "learning_rate": 6.282051282051282e-06, + "loss": 0.0014, + "step": 10170 + }, + { + "epoch": 43.005042735042736, + "grad_norm": 0.06166832521557808, + "learning_rate": 6.277302943969611e-06, + "loss": 0.9794, + "step": 10180 + }, + { + "epoch": 43.00547008547009, + "grad_norm": 0.0030286931432783604, + "learning_rate": 6.272554605887939e-06, + "loss": 0.2002, + "step": 10190 + }, + { + "epoch": 43.00589743589744, + "grad_norm": 0.006914252880960703, + "learning_rate": 6.267806267806268e-06, + "loss": 0.0001, + "step": 10200 + }, + { + "epoch": 43.00632478632479, + "grad_norm": 149.87620544433594, + "learning_rate": 6.2630579297245964e-06, + "loss": 1.023, + "step": 10210 + }, + { + "epoch": 43.006752136752134, + "grad_norm": 0.0074666948057711124, + "learning_rate": 6.2583095916429256e-06, + "loss": 0.2019, + "step": 10220 + }, + { + "epoch": 43.007179487179485, + "grad_norm": 0.0010838122107088566, + "learning_rate": 6.253561253561255e-06, + "loss": 0.0011, + "step": 10230 + }, + { + "epoch": 43.007606837606836, + "grad_norm": 0.10124364495277405, + "learning_rate": 6.248812915479583e-06, + "loss": 0.3781, + "step": 10240 + }, + { + "epoch": 43.00803418803419, + "grad_norm": 0.0029677078127861023, + "learning_rate": 6.244064577397911e-06, + "loss": 0.1722, + "step": 10250 + }, + { + "epoch": 43.00846153846154, + "grad_norm": 0.012870069593191147, + "learning_rate": 6.23931623931624e-06, + "loss": 0.0001, + "step": 10260 + }, + { + "epoch": 43.00888888888889, + "grad_norm": 0.0010046313982456923, + "learning_rate": 6.234567901234569e-06, + "loss": 0.0003, + "step": 10270 + }, + { + "epoch": 43.00931623931624, + "grad_norm": 0.0031989836134016514, + "learning_rate": 6.229819563152897e-06, + "loss": 0.0002, + "step": 10280 + }, + { + "epoch": 43.00974358974359, + "grad_norm": 0.0007750399527139962, + "learning_rate": 6.225071225071225e-06, + "loss": 0.0002, + "step": 10290 + }, + { + "epoch": 43.01, + "eval_accuracy": 0.44, + "eval_loss": 4.78167200088501, + "eval_runtime": 33.6921, + "eval_samples_per_second": 0.742, + "eval_steps_per_second": 0.742, + "step": 10296 + }, + { + "epoch": 44.00017094017094, + "grad_norm": 0.0008096142555586994, + "learning_rate": 6.220322886989554e-06, + "loss": 0.7658, + "step": 10300 + }, + { + "epoch": 44.00059829059829, + "grad_norm": 0.001107421237975359, + "learning_rate": 6.215574548907882e-06, + "loss": 0.0011, + "step": 10310 + }, + { + "epoch": 44.00102564102564, + "grad_norm": 147.27357482910156, + "learning_rate": 6.210826210826212e-06, + "loss": 0.8891, + "step": 10320 + }, + { + "epoch": 44.00145299145299, + "grad_norm": 0.21983663737773895, + "learning_rate": 6.20607787274454e-06, + "loss": 0.0002, + "step": 10330 + }, + { + "epoch": 44.001880341880344, + "grad_norm": 0.0018023523734882474, + "learning_rate": 6.2013295346628685e-06, + "loss": 0.0806, + "step": 10340 + }, + { + "epoch": 44.002307692307696, + "grad_norm": 0.0013478569453582168, + "learning_rate": 6.196581196581198e-06, + "loss": 0.442, + "step": 10350 + }, + { + "epoch": 44.00273504273504, + "grad_norm": 0.0480712465941906, + "learning_rate": 6.191832858499526e-06, + "loss": 1.1859, + "step": 10360 + }, + { + "epoch": 44.00316239316239, + "grad_norm": 0.0018559067975729704, + "learning_rate": 6.187084520417854e-06, + "loss": 0.8634, + "step": 10370 + }, + { + "epoch": 44.00358974358974, + "grad_norm": 0.0008481431868858635, + "learning_rate": 6.1823361823361825e-06, + "loss": 0.8489, + "step": 10380 + }, + { + "epoch": 44.004017094017094, + "grad_norm": 0.003446524264290929, + "learning_rate": 6.177587844254511e-06, + "loss": 0.5956, + "step": 10390 + }, + { + "epoch": 44.004444444444445, + "grad_norm": 0.0011318651959300041, + "learning_rate": 6.17283950617284e-06, + "loss": 0.0008, + "step": 10400 + }, + { + "epoch": 44.0048717948718, + "grad_norm": 0.002299872925505042, + "learning_rate": 6.168091168091168e-06, + "loss": 0.0322, + "step": 10410 + }, + { + "epoch": 44.00529914529915, + "grad_norm": 0.002232542959973216, + "learning_rate": 6.1633428300094974e-06, + "loss": 1.0857, + "step": 10420 + }, + { + "epoch": 44.0057264957265, + "grad_norm": 0.1128619983792305, + "learning_rate": 6.1585944919278266e-06, + "loss": 0.0205, + "step": 10430 + }, + { + "epoch": 44.00615384615384, + "grad_norm": 0.12662559747695923, + "learning_rate": 6.153846153846155e-06, + "loss": 0.876, + "step": 10440 + }, + { + "epoch": 44.006581196581195, + "grad_norm": 0.009139850735664368, + "learning_rate": 6.149097815764483e-06, + "loss": 0.0007, + "step": 10450 + }, + { + "epoch": 44.007008547008546, + "grad_norm": 0.0021971724927425385, + "learning_rate": 6.1443494776828115e-06, + "loss": 0.0002, + "step": 10460 + }, + { + "epoch": 44.0074358974359, + "grad_norm": 0.00538423378020525, + "learning_rate": 6.13960113960114e-06, + "loss": 0.0001, + "step": 10470 + }, + { + "epoch": 44.00786324786325, + "grad_norm": 0.026287034153938293, + "learning_rate": 6.134852801519469e-06, + "loss": 0.0432, + "step": 10480 + }, + { + "epoch": 44.0082905982906, + "grad_norm": 0.0007875883602537215, + "learning_rate": 6.130104463437797e-06, + "loss": 0.266, + "step": 10490 + }, + { + "epoch": 44.00871794871795, + "grad_norm": 0.01668292097747326, + "learning_rate": 6.1253561253561255e-06, + "loss": 0.0248, + "step": 10500 + }, + { + "epoch": 44.0091452991453, + "grad_norm": 0.025921987369656563, + "learning_rate": 6.120607787274454e-06, + "loss": 0.0002, + "step": 10510 + }, + { + "epoch": 44.00957264957265, + "grad_norm": 0.0005760848871432245, + "learning_rate": 6.115859449192783e-06, + "loss": 0.0901, + "step": 10520 + }, + { + "epoch": 44.01, + "grad_norm": 0.0007085147080942988, + "learning_rate": 6.111111111111112e-06, + "loss": 0.0001, + "step": 10530 + }, + { + "epoch": 44.01, + "eval_accuracy": 0.48, + "eval_loss": 4.866738319396973, + "eval_runtime": 33.6315, + "eval_samples_per_second": 0.743, + "eval_steps_per_second": 0.743, + "step": 10530 + }, + { + "epoch": 45.00042735042735, + "grad_norm": 0.013209497556090355, + "learning_rate": 6.10636277302944e-06, + "loss": 0.2852, + "step": 10540 + }, + { + "epoch": 45.0008547008547, + "grad_norm": 0.0015938766300678253, + "learning_rate": 6.101614434947769e-06, + "loss": 0.0001, + "step": 10550 + }, + { + "epoch": 45.001282051282054, + "grad_norm": 0.05549630522727966, + "learning_rate": 6.096866096866098e-06, + "loss": 0.9446, + "step": 10560 + }, + { + "epoch": 45.001709401709405, + "grad_norm": 0.0019539748318493366, + "learning_rate": 6.092117758784426e-06, + "loss": 0.0001, + "step": 10570 + }, + { + "epoch": 45.00213675213675, + "grad_norm": 0.0008734731236472726, + "learning_rate": 6.087369420702754e-06, + "loss": 0.0078, + "step": 10580 + }, + { + "epoch": 45.0025641025641, + "grad_norm": 0.0014504172140732408, + "learning_rate": 6.082621082621083e-06, + "loss": 0.0003, + "step": 10590 + }, + { + "epoch": 45.00299145299145, + "grad_norm": 0.0006624649395234883, + "learning_rate": 6.077872744539412e-06, + "loss": 0.0005, + "step": 10600 + }, + { + "epoch": 45.0034188034188, + "grad_norm": 0.0008454503258690238, + "learning_rate": 6.07312440645774e-06, + "loss": 0.7002, + "step": 10610 + }, + { + "epoch": 45.003846153846155, + "grad_norm": 0.0008091210620477796, + "learning_rate": 6.0683760683760684e-06, + "loss": 0.0006, + "step": 10620 + }, + { + "epoch": 45.004273504273506, + "grad_norm": 0.0016585165867581964, + "learning_rate": 6.063627730294398e-06, + "loss": 0.0001, + "step": 10630 + }, + { + "epoch": 45.00470085470086, + "grad_norm": 0.0022859878372401, + "learning_rate": 6.058879392212727e-06, + "loss": 0.1929, + "step": 10640 + }, + { + "epoch": 45.00512820512821, + "grad_norm": 604.373291015625, + "learning_rate": 6.054131054131055e-06, + "loss": 0.6576, + "step": 10650 + }, + { + "epoch": 45.00555555555555, + "grad_norm": 0.015343650244176388, + "learning_rate": 6.049382716049383e-06, + "loss": 0.555, + "step": 10660 + }, + { + "epoch": 45.005982905982904, + "grad_norm": 0.0035480374936014414, + "learning_rate": 6.044634377967712e-06, + "loss": 0.0039, + "step": 10670 + }, + { + "epoch": 45.006410256410255, + "grad_norm": 0.0072784130461514, + "learning_rate": 6.039886039886041e-06, + "loss": 0.0001, + "step": 10680 + }, + { + "epoch": 45.00683760683761, + "grad_norm": 0.0026009362190961838, + "learning_rate": 6.035137701804369e-06, + "loss": 0.0002, + "step": 10690 + }, + { + "epoch": 45.00726495726496, + "grad_norm": 0.000990421511232853, + "learning_rate": 6.030389363722697e-06, + "loss": 0.0012, + "step": 10700 + }, + { + "epoch": 45.00769230769231, + "grad_norm": 0.0006937840371392667, + "learning_rate": 6.025641025641026e-06, + "loss": 0.0, + "step": 10710 + }, + { + "epoch": 45.00811965811966, + "grad_norm": 0.001313463319092989, + "learning_rate": 6.020892687559354e-06, + "loss": 0.0073, + "step": 10720 + }, + { + "epoch": 45.00854700854701, + "grad_norm": 0.02181072160601616, + "learning_rate": 6.016144349477683e-06, + "loss": 0.0001, + "step": 10730 + }, + { + "epoch": 45.008974358974356, + "grad_norm": 0.2195536494255066, + "learning_rate": 6.011396011396012e-06, + "loss": 0.0003, + "step": 10740 + }, + { + "epoch": 45.00940170940171, + "grad_norm": 0.006073196418583393, + "learning_rate": 6.0066476733143405e-06, + "loss": 0.0004, + "step": 10750 + }, + { + "epoch": 45.00982905982906, + "grad_norm": 0.0016091590514406562, + "learning_rate": 6.00189933523267e-06, + "loss": 0.0007, + "step": 10760 + }, + { + "epoch": 45.01, + "eval_accuracy": 0.48, + "eval_loss": 4.561890125274658, + "eval_runtime": 34.2721, + "eval_samples_per_second": 0.729, + "eval_steps_per_second": 0.729, + "step": 10764 + }, + { + "epoch": 46.00025641025641, + "grad_norm": 0.0011347552062943578, + "learning_rate": 5.997150997150998e-06, + "loss": 0.7172, + "step": 10770 + }, + { + "epoch": 46.00068376068376, + "grad_norm": 375.4585266113281, + "learning_rate": 5.992402659069326e-06, + "loss": 0.821, + "step": 10780 + }, + { + "epoch": 46.00111111111111, + "grad_norm": 0.0010995555203408003, + "learning_rate": 5.9876543209876546e-06, + "loss": 0.1693, + "step": 10790 + }, + { + "epoch": 46.00153846153846, + "grad_norm": 0.11372661590576172, + "learning_rate": 5.982905982905983e-06, + "loss": 0.0002, + "step": 10800 + }, + { + "epoch": 46.00196581196581, + "grad_norm": 0.0009383049909956753, + "learning_rate": 5.978157644824312e-06, + "loss": 0.0001, + "step": 10810 + }, + { + "epoch": 46.00239316239316, + "grad_norm": 0.004879310727119446, + "learning_rate": 5.97340930674264e-06, + "loss": 0.0005, + "step": 10820 + }, + { + "epoch": 46.00282051282051, + "grad_norm": 0.00243366789072752, + "learning_rate": 5.968660968660969e-06, + "loss": 0.1072, + "step": 10830 + }, + { + "epoch": 46.003247863247864, + "grad_norm": 0.0007890159031376243, + "learning_rate": 5.963912630579299e-06, + "loss": 0.0003, + "step": 10840 + }, + { + "epoch": 46.003675213675216, + "grad_norm": 0.009691433981060982, + "learning_rate": 5.959164292497627e-06, + "loss": 0.4108, + "step": 10850 + }, + { + "epoch": 46.00410256410257, + "grad_norm": 0.003631254890933633, + "learning_rate": 5.954415954415955e-06, + "loss": 0.0001, + "step": 10860 + }, + { + "epoch": 46.00452991452991, + "grad_norm": 0.0005667143850587308, + "learning_rate": 5.9496676163342835e-06, + "loss": 1.0405, + "step": 10870 + }, + { + "epoch": 46.00495726495726, + "grad_norm": 0.0033552220556885004, + "learning_rate": 5.944919278252612e-06, + "loss": 0.0001, + "step": 10880 + }, + { + "epoch": 46.005384615384614, + "grad_norm": 0.07237747311592102, + "learning_rate": 5.940170940170941e-06, + "loss": 0.6576, + "step": 10890 + }, + { + "epoch": 46.005811965811965, + "grad_norm": 0.0010696067474782467, + "learning_rate": 5.935422602089269e-06, + "loss": 0.0001, + "step": 10900 + }, + { + "epoch": 46.006239316239316, + "grad_norm": 0.0015365901635959744, + "learning_rate": 5.9306742640075975e-06, + "loss": 0.7127, + "step": 10910 + }, + { + "epoch": 46.00666666666667, + "grad_norm": 0.002339027589187026, + "learning_rate": 5.925925925925926e-06, + "loss": 0.0023, + "step": 10920 + }, + { + "epoch": 46.00709401709402, + "grad_norm": 0.01849093660712242, + "learning_rate": 5.921177587844255e-06, + "loss": 0.4024, + "step": 10930 + }, + { + "epoch": 46.00752136752137, + "grad_norm": 0.009332343935966492, + "learning_rate": 5.916429249762583e-06, + "loss": 0.0002, + "step": 10940 + }, + { + "epoch": 46.007948717948715, + "grad_norm": 0.009404930286109447, + "learning_rate": 5.911680911680912e-06, + "loss": 0.001, + "step": 10950 + }, + { + "epoch": 46.008376068376066, + "grad_norm": 0.0012134173884987831, + "learning_rate": 5.906932573599241e-06, + "loss": 0.5511, + "step": 10960 + }, + { + "epoch": 46.00880341880342, + "grad_norm": 0.01179675292223692, + "learning_rate": 5.90218423551757e-06, + "loss": 0.0018, + "step": 10970 + }, + { + "epoch": 46.00923076923077, + "grad_norm": 0.0012383083812892437, + "learning_rate": 5.897435897435898e-06, + "loss": 0.7245, + "step": 10980 + }, + { + "epoch": 46.00965811965812, + "grad_norm": 0.001664446317590773, + "learning_rate": 5.8926875593542264e-06, + "loss": 0.0009, + "step": 10990 + }, + { + "epoch": 46.01, + "eval_accuracy": 0.44, + "eval_loss": 5.025023937225342, + "eval_runtime": 33.6102, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.744, + "step": 10998 + }, + { + "epoch": 47.00008547008547, + "grad_norm": 0.002813153900206089, + "learning_rate": 5.887939221272555e-06, + "loss": 0.6121, + "step": 11000 + }, + { + "epoch": 47.00051282051282, + "grad_norm": 0.002290428848937154, + "learning_rate": 5.883190883190884e-06, + "loss": 0.0002, + "step": 11010 + }, + { + "epoch": 47.00094017094017, + "grad_norm": 0.013784612528979778, + "learning_rate": 5.878442545109212e-06, + "loss": 0.0041, + "step": 11020 + }, + { + "epoch": 47.00136752136752, + "grad_norm": 0.008057618513703346, + "learning_rate": 5.8736942070275405e-06, + "loss": 0.0002, + "step": 11030 + }, + { + "epoch": 47.00179487179487, + "grad_norm": 0.0026002710219472647, + "learning_rate": 5.868945868945869e-06, + "loss": 0.9305, + "step": 11040 + }, + { + "epoch": 47.00222222222222, + "grad_norm": 375.0780029296875, + "learning_rate": 5.864197530864199e-06, + "loss": 0.9073, + "step": 11050 + }, + { + "epoch": 47.002649572649574, + "grad_norm": 0.002601037034764886, + "learning_rate": 5.859449192782527e-06, + "loss": 0.1539, + "step": 11060 + }, + { + "epoch": 47.003076923076925, + "grad_norm": 0.013979580253362656, + "learning_rate": 5.854700854700855e-06, + "loss": 0.0031, + "step": 11070 + }, + { + "epoch": 47.00350427350428, + "grad_norm": 0.003610535990446806, + "learning_rate": 5.849952516619184e-06, + "loss": 0.0126, + "step": 11080 + }, + { + "epoch": 47.00393162393162, + "grad_norm": 0.0018818234093487263, + "learning_rate": 5.845204178537513e-06, + "loss": 0.0001, + "step": 11090 + }, + { + "epoch": 47.00435897435897, + "grad_norm": 209.32333374023438, + "learning_rate": 5.840455840455841e-06, + "loss": 1.5139, + "step": 11100 + }, + { + "epoch": 47.00478632478632, + "grad_norm": 0.002293006982654333, + "learning_rate": 5.835707502374169e-06, + "loss": 0.0011, + "step": 11110 + }, + { + "epoch": 47.005213675213675, + "grad_norm": 0.0014583432348445058, + "learning_rate": 5.830959164292498e-06, + "loss": 0.0002, + "step": 11120 + }, + { + "epoch": 47.005641025641026, + "grad_norm": 0.000818538770545274, + "learning_rate": 5.826210826210826e-06, + "loss": 0.0164, + "step": 11130 + }, + { + "epoch": 47.00606837606838, + "grad_norm": 0.025977041572332382, + "learning_rate": 5.821462488129155e-06, + "loss": 0.0002, + "step": 11140 + }, + { + "epoch": 47.00649572649573, + "grad_norm": 0.0008319366024807096, + "learning_rate": 5.816714150047484e-06, + "loss": 0.0003, + "step": 11150 + }, + { + "epoch": 47.00692307692308, + "grad_norm": 0.007519662380218506, + "learning_rate": 5.8119658119658126e-06, + "loss": 0.0643, + "step": 11160 + }, + { + "epoch": 47.007350427350424, + "grad_norm": 0.14387542009353638, + "learning_rate": 5.807217473884142e-06, + "loss": 0.0002, + "step": 11170 + }, + { + "epoch": 47.007777777777775, + "grad_norm": 0.0030812029726803303, + "learning_rate": 5.80246913580247e-06, + "loss": 0.0001, + "step": 11180 + }, + { + "epoch": 47.00820512820513, + "grad_norm": 0.0011086307931691408, + "learning_rate": 5.797720797720798e-06, + "loss": 0.3969, + "step": 11190 + }, + { + "epoch": 47.00863247863248, + "grad_norm": 0.0029646113980561495, + "learning_rate": 5.792972459639127e-06, + "loss": 0.0001, + "step": 11200 + }, + { + "epoch": 47.00905982905983, + "grad_norm": 0.003931617829948664, + "learning_rate": 5.788224121557455e-06, + "loss": 0.7591, + "step": 11210 + }, + { + "epoch": 47.00948717948718, + "grad_norm": 0.006124768406152725, + "learning_rate": 5.783475783475784e-06, + "loss": 0.2924, + "step": 11220 + }, + { + "epoch": 47.00991452991453, + "grad_norm": 0.0010141300735995173, + "learning_rate": 5.778727445394112e-06, + "loss": 0.0001, + "step": 11230 + }, + { + "epoch": 47.01, + "eval_accuracy": 0.48, + "eval_loss": 4.412927627563477, + "eval_runtime": 33.7429, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.741, + "step": 11232 + }, + { + "epoch": 48.00034188034188, + "grad_norm": 0.007142414338886738, + "learning_rate": 5.773979107312441e-06, + "loss": 0.8418, + "step": 11240 + }, + { + "epoch": 48.00076923076923, + "grad_norm": 1.3151068687438965, + "learning_rate": 5.769230769230769e-06, + "loss": 1.6759, + "step": 11250 + }, + { + "epoch": 48.00119658119658, + "grad_norm": 0.0036189991515129805, + "learning_rate": 5.764482431149099e-06, + "loss": 0.0004, + "step": 11260 + }, + { + "epoch": 48.00162393162393, + "grad_norm": 0.000717452319804579, + "learning_rate": 5.759734093067427e-06, + "loss": 0.0001, + "step": 11270 + }, + { + "epoch": 48.00205128205128, + "grad_norm": 0.07057972997426987, + "learning_rate": 5.7549857549857555e-06, + "loss": 0.0422, + "step": 11280 + }, + { + "epoch": 48.002478632478635, + "grad_norm": 589.4522094726562, + "learning_rate": 5.750237416904084e-06, + "loss": 0.0541, + "step": 11290 + }, + { + "epoch": 48.002905982905986, + "grad_norm": 0.01362372562289238, + "learning_rate": 5.745489078822413e-06, + "loss": 0.0055, + "step": 11300 + }, + { + "epoch": 48.00333333333333, + "grad_norm": 0.01434069313108921, + "learning_rate": 5.740740740740741e-06, + "loss": 0.7408, + "step": 11310 + }, + { + "epoch": 48.00376068376068, + "grad_norm": 0.0020310671534389257, + "learning_rate": 5.7359924026590695e-06, + "loss": 0.0002, + "step": 11320 + }, + { + "epoch": 48.00418803418803, + "grad_norm": 0.005610823631286621, + "learning_rate": 5.731244064577398e-06, + "loss": 0.0002, + "step": 11330 + }, + { + "epoch": 48.004615384615384, + "grad_norm": 0.0008401147206313908, + "learning_rate": 5.726495726495727e-06, + "loss": 0.0001, + "step": 11340 + }, + { + "epoch": 48.005042735042736, + "grad_norm": 0.03286376968026161, + "learning_rate": 5.721747388414055e-06, + "loss": 0.0007, + "step": 11350 + }, + { + "epoch": 48.00547008547009, + "grad_norm": 0.002525913529098034, + "learning_rate": 5.716999050332384e-06, + "loss": 0.0003, + "step": 11360 + }, + { + "epoch": 48.00589743589744, + "grad_norm": 0.0011132374638691545, + "learning_rate": 5.7122507122507136e-06, + "loss": 0.0, + "step": 11370 + }, + { + "epoch": 48.00632478632479, + "grad_norm": 0.0005615410045720637, + "learning_rate": 5.707502374169042e-06, + "loss": 0.0002, + "step": 11380 + }, + { + "epoch": 48.006752136752134, + "grad_norm": 0.006012198980897665, + "learning_rate": 5.70275403608737e-06, + "loss": 0.0007, + "step": 11390 + }, + { + "epoch": 48.007179487179485, + "grad_norm": 0.014253268949687481, + "learning_rate": 5.6980056980056985e-06, + "loss": 0.0001, + "step": 11400 + }, + { + "epoch": 48.007606837606836, + "grad_norm": 0.001018693670630455, + "learning_rate": 5.693257359924027e-06, + "loss": 0.0002, + "step": 11410 + }, + { + "epoch": 48.00803418803419, + "grad_norm": 0.030744800344109535, + "learning_rate": 5.688509021842356e-06, + "loss": 0.6403, + "step": 11420 + }, + { + "epoch": 48.00846153846154, + "grad_norm": 0.013697023503482342, + "learning_rate": 5.683760683760684e-06, + "loss": 0.0001, + "step": 11430 + }, + { + "epoch": 48.00888888888889, + "grad_norm": 0.01648533344268799, + "learning_rate": 5.6790123456790125e-06, + "loss": 0.0002, + "step": 11440 + }, + { + "epoch": 48.00931623931624, + "grad_norm": 0.0017602647421881557, + "learning_rate": 5.674264007597341e-06, + "loss": 0.0014, + "step": 11450 + }, + { + "epoch": 48.00974358974359, + "grad_norm": 0.0007377556175924838, + "learning_rate": 5.669515669515669e-06, + "loss": 0.0001, + "step": 11460 + }, + { + "epoch": 48.01, + "eval_accuracy": 0.44, + "eval_loss": 5.5987443923950195, + "eval_runtime": 33.7645, + "eval_samples_per_second": 0.74, + "eval_steps_per_second": 0.74, + "step": 11466 + }, + { + "epoch": 49.00017094017094, + "grad_norm": 0.0007492690929211676, + "learning_rate": 5.664767331433999e-06, + "loss": 0.6818, + "step": 11470 + }, + { + "epoch": 49.00059829059829, + "grad_norm": 0.03045983612537384, + "learning_rate": 5.660018993352327e-06, + "loss": 0.5287, + "step": 11480 + }, + { + "epoch": 49.00102564102564, + "grad_norm": 0.0005993829108774662, + "learning_rate": 5.655270655270656e-06, + "loss": 0.0425, + "step": 11490 + }, + { + "epoch": 49.00145299145299, + "grad_norm": 0.0005655125132761896, + "learning_rate": 5.650522317188985e-06, + "loss": 0.0001, + "step": 11500 + }, + { + "epoch": 49.001880341880344, + "grad_norm": 0.000584998691920191, + "learning_rate": 5.645773979107313e-06, + "loss": 0.0001, + "step": 11510 + }, + { + "epoch": 49.002307692307696, + "grad_norm": 0.0036167309153825045, + "learning_rate": 5.641025641025641e-06, + "loss": 0.0019, + "step": 11520 + }, + { + "epoch": 49.00273504273504, + "grad_norm": 0.0014566316967830062, + "learning_rate": 5.63627730294397e-06, + "loss": 0.0002, + "step": 11530 + }, + { + "epoch": 49.00316239316239, + "grad_norm": 194.14332580566406, + "learning_rate": 5.631528964862298e-06, + "loss": 0.8389, + "step": 11540 + }, + { + "epoch": 49.00358974358974, + "grad_norm": 0.03927035257220268, + "learning_rate": 5.626780626780627e-06, + "loss": 0.0001, + "step": 11550 + }, + { + "epoch": 49.004017094017094, + "grad_norm": 0.01512466836720705, + "learning_rate": 5.6220322886989554e-06, + "loss": 0.0011, + "step": 11560 + }, + { + "epoch": 49.004444444444445, + "grad_norm": 0.0020757820457220078, + "learning_rate": 5.617283950617285e-06, + "loss": 0.3843, + "step": 11570 + }, + { + "epoch": 49.0048717948718, + "grad_norm": 0.0007370910025201738, + "learning_rate": 5.612535612535614e-06, + "loss": 0.0001, + "step": 11580 + }, + { + "epoch": 49.00529914529915, + "grad_norm": 0.0047119674272835255, + "learning_rate": 5.607787274453942e-06, + "loss": 0.4626, + "step": 11590 + }, + { + "epoch": 49.0057264957265, + "grad_norm": 0.0026911916211247444, + "learning_rate": 5.60303893637227e-06, + "loss": 0.0649, + "step": 11600 + }, + { + "epoch": 49.00615384615384, + "grad_norm": 0.02297067642211914, + "learning_rate": 5.598290598290599e-06, + "loss": 0.0036, + "step": 11610 + }, + { + "epoch": 49.006581196581195, + "grad_norm": 0.0033419374376535416, + "learning_rate": 5.593542260208927e-06, + "loss": 0.0001, + "step": 11620 + }, + { + "epoch": 49.007008547008546, + "grad_norm": 0.08777555078268051, + "learning_rate": 5.588793922127256e-06, + "loss": 0.95, + "step": 11630 + }, + { + "epoch": 49.0074358974359, + "grad_norm": 0.0006313354824669659, + "learning_rate": 5.584045584045584e-06, + "loss": 0.6174, + "step": 11640 + }, + { + "epoch": 49.00786324786325, + "grad_norm": 0.0009615861345082521, + "learning_rate": 5.579297245963913e-06, + "loss": 0.0004, + "step": 11650 + }, + { + "epoch": 49.0082905982906, + "grad_norm": 0.04878242313861847, + "learning_rate": 5.574548907882241e-06, + "loss": 0.0001, + "step": 11660 + }, + { + "epoch": 49.00871794871795, + "grad_norm": 0.0008145536412484944, + "learning_rate": 5.56980056980057e-06, + "loss": 0.0002, + "step": 11670 + }, + { + "epoch": 49.0091452991453, + "grad_norm": 0.0008402611128985882, + "learning_rate": 5.565052231718899e-06, + "loss": 0.0, + "step": 11680 + }, + { + "epoch": 49.00957264957265, + "grad_norm": 0.17107157409191132, + "learning_rate": 5.5603038936372275e-06, + "loss": 0.044, + "step": 11690 + }, + { + "epoch": 49.01, + "grad_norm": 0.007154208142310381, + "learning_rate": 5.555555555555557e-06, + "loss": 0.0003, + "step": 11700 + }, + { + "epoch": 49.01, + "eval_accuracy": 0.44, + "eval_loss": 5.4567461013793945, + "eval_runtime": 33.7217, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.741, + "step": 11700 + }, + { + "epoch": 50.00042735042735, + "grad_norm": 0.0006979930330999196, + "learning_rate": 5.550807217473885e-06, + "loss": 0.0, + "step": 11710 + }, + { + "epoch": 50.0008547008547, + "grad_norm": 0.005530244205147028, + "learning_rate": 5.546058879392213e-06, + "loss": 0.0002, + "step": 11720 + }, + { + "epoch": 50.001282051282054, + "grad_norm": 0.007461980450898409, + "learning_rate": 5.5413105413105416e-06, + "loss": 0.0723, + "step": 11730 + }, + { + "epoch": 50.001709401709405, + "grad_norm": 0.000708905456122011, + "learning_rate": 5.53656220322887e-06, + "loss": 0.0008, + "step": 11740 + }, + { + "epoch": 50.00213675213675, + "grad_norm": 0.0005582983139902353, + "learning_rate": 5.531813865147199e-06, + "loss": 0.2155, + "step": 11750 + }, + { + "epoch": 50.0025641025641, + "grad_norm": 0.001401066081598401, + "learning_rate": 5.527065527065527e-06, + "loss": 0.0042, + "step": 11760 + }, + { + "epoch": 50.00299145299145, + "grad_norm": 0.0036240883637219667, + "learning_rate": 5.522317188983856e-06, + "loss": 0.5656, + "step": 11770 + }, + { + "epoch": 50.0034188034188, + "grad_norm": 0.0006863917224109173, + "learning_rate": 5.517568850902186e-06, + "loss": 0.0002, + "step": 11780 + }, + { + "epoch": 50.003846153846155, + "grad_norm": 0.0006204345263540745, + "learning_rate": 5.512820512820514e-06, + "loss": 0.0001, + "step": 11790 + }, + { + "epoch": 50.004273504273506, + "grad_norm": 0.0025562657974660397, + "learning_rate": 5.508072174738842e-06, + "loss": 0.0004, + "step": 11800 + }, + { + "epoch": 50.00470085470086, + "grad_norm": 0.0017411591252312064, + "learning_rate": 5.5033238366571705e-06, + "loss": 0.0002, + "step": 11810 + }, + { + "epoch": 50.00512820512821, + "grad_norm": 23.52789306640625, + "learning_rate": 5.498575498575499e-06, + "loss": 0.0036, + "step": 11820 + }, + { + "epoch": 50.00555555555555, + "grad_norm": 0.0005795454489998519, + "learning_rate": 5.493827160493828e-06, + "loss": 0.7016, + "step": 11830 + }, + { + "epoch": 50.005982905982904, + "grad_norm": 0.0019489077385514975, + "learning_rate": 5.489078822412156e-06, + "loss": 0.0001, + "step": 11840 + }, + { + "epoch": 50.006410256410255, + "grad_norm": 0.0617222860455513, + "learning_rate": 5.4843304843304845e-06, + "loss": 0.0001, + "step": 11850 + }, + { + "epoch": 50.00683760683761, + "grad_norm": 403.005859375, + "learning_rate": 5.479582146248813e-06, + "loss": 0.1644, + "step": 11860 + }, + { + "epoch": 50.00726495726496, + "grad_norm": 0.14567570388317108, + "learning_rate": 5.474833808167141e-06, + "loss": 0.0004, + "step": 11870 + }, + { + "epoch": 50.00769230769231, + "grad_norm": 0.0005991229554638267, + "learning_rate": 5.470085470085471e-06, + "loss": 0.0898, + "step": 11880 + }, + { + "epoch": 50.00811965811966, + "grad_norm": 0.14940251410007477, + "learning_rate": 5.465337132003799e-06, + "loss": 0.0001, + "step": 11890 + }, + { + "epoch": 50.00854700854701, + "grad_norm": 0.028690291568636894, + "learning_rate": 5.460588793922128e-06, + "loss": 0.0023, + "step": 11900 + }, + { + "epoch": 50.008974358974356, + "grad_norm": 0.21514463424682617, + "learning_rate": 5.455840455840457e-06, + "loss": 0.6174, + "step": 11910 + }, + { + "epoch": 50.00940170940171, + "grad_norm": 0.0008885301067493856, + "learning_rate": 5.451092117758785e-06, + "loss": 0.0001, + "step": 11920 + }, + { + "epoch": 50.00982905982906, + "grad_norm": 0.0035741357132792473, + "learning_rate": 5.446343779677113e-06, + "loss": 0.0468, + "step": 11930 + }, + { + "epoch": 50.01, + "eval_accuracy": 0.48, + "eval_loss": 5.021820545196533, + "eval_runtime": 33.527, + "eval_samples_per_second": 0.746, + "eval_steps_per_second": 0.746, + "step": 11934 + }, + { + "epoch": 51.00025641025641, + "grad_norm": 0.11837821453809738, + "learning_rate": 5.441595441595442e-06, + "loss": 0.0001, + "step": 11940 + }, + { + "epoch": 51.00068376068376, + "grad_norm": 32.09723663330078, + "learning_rate": 5.43684710351377e-06, + "loss": 0.0062, + "step": 11950 + }, + { + "epoch": 51.00111111111111, + "grad_norm": 0.0008311708807013929, + "learning_rate": 5.432098765432099e-06, + "loss": 0.4382, + "step": 11960 + }, + { + "epoch": 51.00153846153846, + "grad_norm": 0.00047426484525203705, + "learning_rate": 5.4273504273504275e-06, + "loss": 0.0019, + "step": 11970 + }, + { + "epoch": 51.00196581196581, + "grad_norm": 0.0012167677050456405, + "learning_rate": 5.422602089268756e-06, + "loss": 0.0002, + "step": 11980 + }, + { + "epoch": 51.00239316239316, + "grad_norm": 0.0028268792666494846, + "learning_rate": 5.417853751187086e-06, + "loss": 0.0001, + "step": 11990 + }, + { + "epoch": 51.00282051282051, + "grad_norm": 0.0009294459596276283, + "learning_rate": 5.413105413105414e-06, + "loss": 0.0001, + "step": 12000 + }, + { + "epoch": 51.003247863247864, + "grad_norm": 0.014754371717572212, + "learning_rate": 5.408357075023742e-06, + "loss": 0.0001, + "step": 12010 + }, + { + "epoch": 51.003675213675216, + "grad_norm": 0.027568388730287552, + "learning_rate": 5.403608736942071e-06, + "loss": 0.0002, + "step": 12020 + }, + { + "epoch": 51.00410256410257, + "grad_norm": 0.07785722613334656, + "learning_rate": 5.3988603988604e-06, + "loss": 0.0001, + "step": 12030 + }, + { + "epoch": 51.00452991452991, + "grad_norm": 0.0007142764516174793, + "learning_rate": 5.394112060778728e-06, + "loss": 0.0003, + "step": 12040 + }, + { + "epoch": 51.00495726495726, + "grad_norm": 0.002452197717502713, + "learning_rate": 5.389363722697056e-06, + "loss": 0.0003, + "step": 12050 + }, + { + "epoch": 51.005384615384614, + "grad_norm": 0.0013394391862675548, + "learning_rate": 5.384615384615385e-06, + "loss": 0.0001, + "step": 12060 + }, + { + "epoch": 51.005811965811965, + "grad_norm": 0.0005540793645195663, + "learning_rate": 5.379867046533713e-06, + "loss": 0.0, + "step": 12070 + }, + { + "epoch": 51.006239316239316, + "grad_norm": 0.004768889397382736, + "learning_rate": 5.375118708452042e-06, + "loss": 0.0002, + "step": 12080 + }, + { + "epoch": 51.00666666666667, + "grad_norm": 0.0035481529776006937, + "learning_rate": 5.370370370370371e-06, + "loss": 0.004, + "step": 12090 + }, + { + "epoch": 51.00709401709402, + "grad_norm": 0.0004956016782671213, + "learning_rate": 5.3656220322886995e-06, + "loss": 0.0001, + "step": 12100 + }, + { + "epoch": 51.00752136752137, + "grad_norm": 0.0010804428020492196, + "learning_rate": 5.360873694207029e-06, + "loss": 0.0001, + "step": 12110 + }, + { + "epoch": 51.007948717948715, + "grad_norm": 0.004500087816268206, + "learning_rate": 5.356125356125357e-06, + "loss": 0.031, + "step": 12120 + }, + { + "epoch": 51.008376068376066, + "grad_norm": 0.0010846515651792288, + "learning_rate": 5.351377018043685e-06, + "loss": 1.3764, + "step": 12130 + }, + { + "epoch": 51.00880341880342, + "grad_norm": 0.0024583388585597277, + "learning_rate": 5.346628679962014e-06, + "loss": 0.0, + "step": 12140 + }, + { + "epoch": 51.00923076923077, + "grad_norm": 0.000765733711887151, + "learning_rate": 5.341880341880342e-06, + "loss": 0.0134, + "step": 12150 + }, + { + "epoch": 51.00965811965812, + "grad_norm": 0.0004111463204026222, + "learning_rate": 5.337132003798671e-06, + "loss": 0.187, + "step": 12160 + }, + { + "epoch": 51.01, + "eval_accuracy": 0.4, + "eval_loss": 5.326884746551514, + "eval_runtime": 33.7105, + "eval_samples_per_second": 0.742, + "eval_steps_per_second": 0.742, + "step": 12168 + }, + { + "epoch": 52.00008547008547, + "grad_norm": 0.0006366129964590073, + "learning_rate": 5.332383665716999e-06, + "loss": 0.2729, + "step": 12170 + }, + { + "epoch": 52.00051282051282, + "grad_norm": 0.00045478667016141117, + "learning_rate": 5.327635327635328e-06, + "loss": 0.0002, + "step": 12180 + }, + { + "epoch": 52.00094017094017, + "grad_norm": 0.0012099344749003649, + "learning_rate": 5.322886989553656e-06, + "loss": 0.0017, + "step": 12190 + }, + { + "epoch": 52.00136752136752, + "grad_norm": 0.010483755730092525, + "learning_rate": 5.318138651471986e-06, + "loss": 0.0001, + "step": 12200 + }, + { + "epoch": 52.00179487179487, + "grad_norm": 0.001030059065669775, + "learning_rate": 5.313390313390314e-06, + "loss": 0.0001, + "step": 12210 + }, + { + "epoch": 52.00222222222222, + "grad_norm": 0.005780795589089394, + "learning_rate": 5.3086419753086425e-06, + "loss": 0.0001, + "step": 12220 + }, + { + "epoch": 52.002649572649574, + "grad_norm": 0.001182667212560773, + "learning_rate": 5.303893637226971e-06, + "loss": 0.7791, + "step": 12230 + }, + { + "epoch": 52.003076923076925, + "grad_norm": 0.10405033081769943, + "learning_rate": 5.2991452991453e-06, + "loss": 0.172, + "step": 12240 + }, + { + "epoch": 52.00350427350428, + "grad_norm": 0.0005188258364796638, + "learning_rate": 5.294396961063628e-06, + "loss": 0.0001, + "step": 12250 + }, + { + "epoch": 52.00393162393162, + "grad_norm": 0.0017536180093884468, + "learning_rate": 5.2896486229819565e-06, + "loss": 0.0001, + "step": 12260 + }, + { + "epoch": 52.00435897435897, + "grad_norm": 0.0009690941660664976, + "learning_rate": 5.284900284900285e-06, + "loss": 0.0001, + "step": 12270 + }, + { + "epoch": 52.00478632478632, + "grad_norm": 0.0005580552387982607, + "learning_rate": 5.280151946818613e-06, + "loss": 0.0003, + "step": 12280 + }, + { + "epoch": 52.005213675213675, + "grad_norm": 0.1803247481584549, + "learning_rate": 5.275403608736942e-06, + "loss": 0.0001, + "step": 12290 + }, + { + "epoch": 52.005641025641026, + "grad_norm": 0.005001471843570471, + "learning_rate": 5.270655270655271e-06, + "loss": 0.0002, + "step": 12300 + }, + { + "epoch": 52.00606837606838, + "grad_norm": 0.004487255588173866, + "learning_rate": 5.2659069325736e-06, + "loss": 0.5693, + "step": 12310 + }, + { + "epoch": 52.00649572649573, + "grad_norm": 612.653564453125, + "learning_rate": 5.261158594491929e-06, + "loss": 0.7203, + "step": 12320 + }, + { + "epoch": 52.00692307692308, + "grad_norm": 0.0006710452144034207, + "learning_rate": 5.256410256410257e-06, + "loss": 0.0004, + "step": 12330 + }, + { + "epoch": 52.007350427350424, + "grad_norm": 0.0005012887413613498, + "learning_rate": 5.2516619183285854e-06, + "loss": 0.5105, + "step": 12340 + }, + { + "epoch": 52.007777777777775, + "grad_norm": 0.002993144793435931, + "learning_rate": 5.246913580246914e-06, + "loss": 0.8932, + "step": 12350 + }, + { + "epoch": 52.00820512820513, + "grad_norm": 0.2561093270778656, + "learning_rate": 5.242165242165243e-06, + "loss": 0.0001, + "step": 12360 + }, + { + "epoch": 52.00863247863248, + "grad_norm": 0.003413307713344693, + "learning_rate": 5.237416904083571e-06, + "loss": 0.771, + "step": 12370 + }, + { + "epoch": 52.00905982905983, + "grad_norm": 0.000699891708791256, + "learning_rate": 5.2326685660018995e-06, + "loss": 0.8725, + "step": 12380 + }, + { + "epoch": 52.00948717948718, + "grad_norm": 0.0005549703491851687, + "learning_rate": 5.227920227920228e-06, + "loss": 0.0001, + "step": 12390 + }, + { + "epoch": 52.00991452991453, + "grad_norm": 0.02157350443303585, + "learning_rate": 5.223171889838556e-06, + "loss": 0.0002, + "step": 12400 + }, + { + "epoch": 52.01, + "eval_accuracy": 0.44, + "eval_loss": 5.436356067657471, + "eval_runtime": 33.5444, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 12402 + }, + { + "epoch": 53.00034188034188, + "grad_norm": 0.0005383410025388002, + "learning_rate": 5.218423551756886e-06, + "loss": 0.0001, + "step": 12410 + }, + { + "epoch": 53.00076923076923, + "grad_norm": 0.0005392262828536332, + "learning_rate": 5.213675213675214e-06, + "loss": 0.0001, + "step": 12420 + }, + { + "epoch": 53.00119658119658, + "grad_norm": 3.4017317295074463, + "learning_rate": 5.208926875593543e-06, + "loss": 0.8486, + "step": 12430 + }, + { + "epoch": 53.00162393162393, + "grad_norm": 0.0005204902845434844, + "learning_rate": 5.204178537511872e-06, + "loss": 0.7513, + "step": 12440 + }, + { + "epoch": 53.00205128205128, + "grad_norm": 0.05535678565502167, + "learning_rate": 5.1994301994302e-06, + "loss": 0.0001, + "step": 12450 + }, + { + "epoch": 53.002478632478635, + "grad_norm": 0.002707291394472122, + "learning_rate": 5.194681861348528e-06, + "loss": 0.0001, + "step": 12460 + }, + { + "epoch": 53.002905982905986, + "grad_norm": 0.000867334078066051, + "learning_rate": 5.189933523266857e-06, + "loss": 0.0008, + "step": 12470 + }, + { + "epoch": 53.00333333333333, + "grad_norm": 0.0019842975307255983, + "learning_rate": 5.185185185185185e-06, + "loss": 0.0007, + "step": 12480 + }, + { + "epoch": 53.00376068376068, + "grad_norm": 0.002031872980296612, + "learning_rate": 5.180436847103514e-06, + "loss": 0.0013, + "step": 12490 + }, + { + "epoch": 53.00418803418803, + "grad_norm": 0.8920242786407471, + "learning_rate": 5.175688509021842e-06, + "loss": 0.0004, + "step": 12500 + }, + { + "epoch": 53.004615384615384, + "grad_norm": 0.005432880017906427, + "learning_rate": 5.1709401709401716e-06, + "loss": 0.0011, + "step": 12510 + }, + { + "epoch": 53.005042735042736, + "grad_norm": 0.0005356788169592619, + "learning_rate": 5.166191832858501e-06, + "loss": 0.1904, + "step": 12520 + }, + { + "epoch": 53.00547008547009, + "grad_norm": 0.00788265373557806, + "learning_rate": 5.161443494776829e-06, + "loss": 0.0093, + "step": 12530 + }, + { + "epoch": 53.00589743589744, + "grad_norm": 0.0006382990977726877, + "learning_rate": 5.156695156695157e-06, + "loss": 0.0003, + "step": 12540 + }, + { + "epoch": 53.00632478632479, + "grad_norm": 0.0006224510143510997, + "learning_rate": 5.151946818613486e-06, + "loss": 0.012, + "step": 12550 + }, + { + "epoch": 53.006752136752134, + "grad_norm": 0.00042305790702812374, + "learning_rate": 5.147198480531814e-06, + "loss": 0.0001, + "step": 12560 + }, + { + "epoch": 53.007179487179485, + "grad_norm": 0.00042074365774169564, + "learning_rate": 5.142450142450143e-06, + "loss": 0.0009, + "step": 12570 + }, + { + "epoch": 53.007606837606836, + "grad_norm": 0.00039612961700186133, + "learning_rate": 5.137701804368471e-06, + "loss": 0.0001, + "step": 12580 + }, + { + "epoch": 53.00803418803419, + "grad_norm": 0.029377451166510582, + "learning_rate": 5.1329534662868e-06, + "loss": 0.0, + "step": 12590 + }, + { + "epoch": 53.00846153846154, + "grad_norm": 0.0005169266369193792, + "learning_rate": 5.128205128205128e-06, + "loss": 0.0001, + "step": 12600 + }, + { + "epoch": 53.00888888888889, + "grad_norm": 0.007242798339575529, + "learning_rate": 5.123456790123458e-06, + "loss": 0.0, + "step": 12610 + }, + { + "epoch": 53.00931623931624, + "grad_norm": 0.01810172200202942, + "learning_rate": 5.118708452041786e-06, + "loss": 0.0055, + "step": 12620 + }, + { + "epoch": 53.00974358974359, + "grad_norm": 0.00594985531643033, + "learning_rate": 5.1139601139601145e-06, + "loss": 0.0001, + "step": 12630 + }, + { + "epoch": 53.01, + "eval_accuracy": 0.44, + "eval_loss": 5.730712890625, + "eval_runtime": 33.9799, + "eval_samples_per_second": 0.736, + "eval_steps_per_second": 0.736, + "step": 12636 + }, + { + "epoch": 54.00017094017094, + "grad_norm": 0.0023435179609805346, + "learning_rate": 5.109211775878443e-06, + "loss": 0.0001, + "step": 12640 + }, + { + "epoch": 54.00059829059829, + "grad_norm": 8.973128318786621, + "learning_rate": 5.104463437796772e-06, + "loss": 0.0012, + "step": 12650 + }, + { + "epoch": 54.00102564102564, + "grad_norm": 0.008565773256123066, + "learning_rate": 5.0997150997151e-06, + "loss": 0.0003, + "step": 12660 + }, + { + "epoch": 54.00145299145299, + "grad_norm": 0.0006859298446215689, + "learning_rate": 5.0949667616334285e-06, + "loss": 0.0, + "step": 12670 + }, + { + "epoch": 54.001880341880344, + "grad_norm": 0.0004111983289476484, + "learning_rate": 5.090218423551757e-06, + "loss": 0.0001, + "step": 12680 + }, + { + "epoch": 54.002307692307696, + "grad_norm": 0.0004719541466329247, + "learning_rate": 5.085470085470086e-06, + "loss": 0.5531, + "step": 12690 + }, + { + "epoch": 54.00273504273504, + "grad_norm": 0.004437173251062632, + "learning_rate": 5.080721747388414e-06, + "loss": 0.006, + "step": 12700 + }, + { + "epoch": 54.00316239316239, + "grad_norm": 0.0022185053676366806, + "learning_rate": 5.075973409306743e-06, + "loss": 0.0003, + "step": 12710 + }, + { + "epoch": 54.00358974358974, + "grad_norm": 0.0506693460047245, + "learning_rate": 5.071225071225072e-06, + "loss": 0.0001, + "step": 12720 + }, + { + "epoch": 54.004017094017094, + "grad_norm": 0.0014226617058739066, + "learning_rate": 5.066476733143401e-06, + "loss": 0.5765, + "step": 12730 + }, + { + "epoch": 54.004444444444445, + "grad_norm": 0.0005160128348506987, + "learning_rate": 5.061728395061729e-06, + "loss": 0.0, + "step": 12740 + }, + { + "epoch": 54.0048717948718, + "grad_norm": 0.0004296216939110309, + "learning_rate": 5.0569800569800575e-06, + "loss": 0.3456, + "step": 12750 + }, + { + "epoch": 54.00529914529915, + "grad_norm": 0.0012724578846246004, + "learning_rate": 5.052231718898386e-06, + "loss": 0.072, + "step": 12760 + }, + { + "epoch": 54.0057264957265, + "grad_norm": 0.0030480227433145046, + "learning_rate": 5.047483380816715e-06, + "loss": 0.0001, + "step": 12770 + }, + { + "epoch": 54.00615384615384, + "grad_norm": 0.0005519132246263325, + "learning_rate": 5.042735042735043e-06, + "loss": 0.0, + "step": 12780 + }, + { + "epoch": 54.006581196581195, + "grad_norm": 0.0007735707913525403, + "learning_rate": 5.0379867046533715e-06, + "loss": 0.4349, + "step": 12790 + }, + { + "epoch": 54.007008547008546, + "grad_norm": 0.0024715519975870848, + "learning_rate": 5.0332383665717e-06, + "loss": 0.9521, + "step": 12800 + }, + { + "epoch": 54.0074358974359, + "grad_norm": 0.005817775148898363, + "learning_rate": 5.028490028490028e-06, + "loss": 0.0001, + "step": 12810 + }, + { + "epoch": 54.00786324786325, + "grad_norm": 0.0017352089053019881, + "learning_rate": 5.023741690408358e-06, + "loss": 0.0004, + "step": 12820 + }, + { + "epoch": 54.0082905982906, + "grad_norm": 0.0010756135452538729, + "learning_rate": 5.018993352326686e-06, + "loss": 0.0001, + "step": 12830 + }, + { + "epoch": 54.00871794871795, + "grad_norm": 0.003498775651678443, + "learning_rate": 5.014245014245015e-06, + "loss": 0.0003, + "step": 12840 + }, + { + "epoch": 54.0091452991453, + "grad_norm": 0.010095912963151932, + "learning_rate": 5.009496676163344e-06, + "loss": 0.017, + "step": 12850 + }, + { + "epoch": 54.00957264957265, + "grad_norm": 0.00034551063436083496, + "learning_rate": 5.004748338081672e-06, + "loss": 0.0026, + "step": 12860 + }, + { + "epoch": 54.01, + "grad_norm": 0.0008016827632673085, + "learning_rate": 5e-06, + "loss": 0.0, + "step": 12870 + }, + { + "epoch": 54.01, + "eval_accuracy": 0.44, + "eval_loss": 5.978097915649414, + "eval_runtime": 33.5999, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.744, + "step": 12870 + }, + { + "epoch": 55.00042735042735, + "grad_norm": 0.0008040638058446348, + "learning_rate": 4.995251661918329e-06, + "loss": 0.0, + "step": 12880 + }, + { + "epoch": 55.0008547008547, + "grad_norm": 0.0010009427787736058, + "learning_rate": 4.990503323836657e-06, + "loss": 0.0243, + "step": 12890 + }, + { + "epoch": 55.001282051282054, + "grad_norm": 0.6322616934776306, + "learning_rate": 4.985754985754986e-06, + "loss": 0.0002, + "step": 12900 + }, + { + "epoch": 55.001709401709405, + "grad_norm": 0.00035148794995620847, + "learning_rate": 4.981006647673315e-06, + "loss": 0.0001, + "step": 12910 + }, + { + "epoch": 55.00213675213675, + "grad_norm": 0.0009499528096057475, + "learning_rate": 4.976258309591644e-06, + "loss": 0.158, + "step": 12920 + }, + { + "epoch": 55.0025641025641, + "grad_norm": 1.7123631238937378, + "learning_rate": 4.971509971509972e-06, + "loss": 0.0004, + "step": 12930 + }, + { + "epoch": 55.00299145299145, + "grad_norm": 0.00044161363621242344, + "learning_rate": 4.9667616334283e-06, + "loss": 0.0001, + "step": 12940 + }, + { + "epoch": 55.0034188034188, + "grad_norm": 0.0013492783764377236, + "learning_rate": 4.962013295346629e-06, + "loss": 0.4795, + "step": 12950 + }, + { + "epoch": 55.003846153846155, + "grad_norm": 0.0004561662208288908, + "learning_rate": 4.957264957264958e-06, + "loss": 0.0001, + "step": 12960 + }, + { + "epoch": 55.004273504273506, + "grad_norm": 0.00044428068213164806, + "learning_rate": 4.952516619183286e-06, + "loss": 0.001, + "step": 12970 + }, + { + "epoch": 55.00470085470086, + "grad_norm": 0.0006210142164491117, + "learning_rate": 4.947768281101615e-06, + "loss": 0.0001, + "step": 12980 + }, + { + "epoch": 55.00512820512821, + "grad_norm": 0.0037781489081680775, + "learning_rate": 4.943019943019943e-06, + "loss": 0.0, + "step": 12990 + }, + { + "epoch": 55.00555555555555, + "grad_norm": 0.0003753666242118925, + "learning_rate": 4.938271604938272e-06, + "loss": 0.0001, + "step": 13000 + }, + { + "epoch": 55.005982905982904, + "grad_norm": 0.009278559125959873, + "learning_rate": 4.933523266856601e-06, + "loss": 0.0001, + "step": 13010 + }, + { + "epoch": 55.006410256410255, + "grad_norm": 1.3523920774459839, + "learning_rate": 4.928774928774929e-06, + "loss": 0.0003, + "step": 13020 + }, + { + "epoch": 55.00683760683761, + "grad_norm": 0.00043029204243794084, + "learning_rate": 4.924026590693257e-06, + "loss": 0.0001, + "step": 13030 + }, + { + "epoch": 55.00726495726496, + "grad_norm": 0.00037467462243512273, + "learning_rate": 4.9192782526115865e-06, + "loss": 0.0001, + "step": 13040 + }, + { + "epoch": 55.00769230769231, + "grad_norm": 0.006767070386558771, + "learning_rate": 4.914529914529915e-06, + "loss": 0.0001, + "step": 13050 + }, + { + "epoch": 55.00811965811966, + "grad_norm": 0.00043661269592121243, + "learning_rate": 4.909781576448244e-06, + "loss": 0.0002, + "step": 13060 + }, + { + "epoch": 55.00854700854701, + "grad_norm": 0.0015735122142359614, + "learning_rate": 4.905033238366572e-06, + "loss": 0.0002, + "step": 13070 + }, + { + "epoch": 55.008974358974356, + "grad_norm": 0.009783231653273106, + "learning_rate": 4.9002849002849006e-06, + "loss": 0.0001, + "step": 13080 + }, + { + "epoch": 55.00940170940171, + "grad_norm": 0.000962352380156517, + "learning_rate": 4.895536562203229e-06, + "loss": 0.9947, + "step": 13090 + }, + { + "epoch": 55.00982905982906, + "grad_norm": 0.0005439156084321439, + "learning_rate": 4.890788224121558e-06, + "loss": 0.0001, + "step": 13100 + }, + { + "epoch": 55.01, + "eval_accuracy": 0.44, + "eval_loss": 4.822087287902832, + "eval_runtime": 33.7892, + "eval_samples_per_second": 0.74, + "eval_steps_per_second": 0.74, + "step": 13104 + }, + { + "epoch": 56.00025641025641, + "grad_norm": 0.010900448076426983, + "learning_rate": 4.886039886039886e-06, + "loss": 0.0001, + "step": 13110 + }, + { + "epoch": 56.00068376068376, + "grad_norm": 0.0009302808903157711, + "learning_rate": 4.8812915479582154e-06, + "loss": 0.0001, + "step": 13120 + }, + { + "epoch": 56.00111111111111, + "grad_norm": 0.0025970793794840574, + "learning_rate": 4.876543209876544e-06, + "loss": 0.582, + "step": 13130 + }, + { + "epoch": 56.00153846153846, + "grad_norm": 0.003855284536257386, + "learning_rate": 4.871794871794872e-06, + "loss": 0.0001, + "step": 13140 + }, + { + "epoch": 56.00196581196581, + "grad_norm": 0.16761374473571777, + "learning_rate": 4.8670465337132e-06, + "loss": 0.7305, + "step": 13150 + }, + { + "epoch": 56.00239316239316, + "grad_norm": 0.01091548427939415, + "learning_rate": 4.8622981956315295e-06, + "loss": 0.0001, + "step": 13160 + }, + { + "epoch": 56.00282051282051, + "grad_norm": 0.0012768160086125135, + "learning_rate": 4.857549857549858e-06, + "loss": 0.0184, + "step": 13170 + }, + { + "epoch": 56.003247863247864, + "grad_norm": 0.0003495585115160793, + "learning_rate": 4.852801519468187e-06, + "loss": 0.0, + "step": 13180 + }, + { + "epoch": 56.003675213675216, + "grad_norm": 0.00034958813921548426, + "learning_rate": 4.848053181386515e-06, + "loss": 0.19, + "step": 13190 + }, + { + "epoch": 56.00410256410257, + "grad_norm": 0.00039932169602252543, + "learning_rate": 4.8433048433048435e-06, + "loss": 0.0, + "step": 13200 + }, + { + "epoch": 56.00452991452991, + "grad_norm": 0.00140125781763345, + "learning_rate": 4.838556505223173e-06, + "loss": 0.4053, + "step": 13210 + }, + { + "epoch": 56.00495726495726, + "grad_norm": 0.00046302462578751147, + "learning_rate": 4.833808167141501e-06, + "loss": 0.1168, + "step": 13220 + }, + { + "epoch": 56.005384615384614, + "grad_norm": 0.07333427667617798, + "learning_rate": 4.829059829059829e-06, + "loss": 0.0, + "step": 13230 + }, + { + "epoch": 56.005811965811965, + "grad_norm": 0.5369700789451599, + "learning_rate": 4.824311490978158e-06, + "loss": 0.0002, + "step": 13240 + }, + { + "epoch": 56.006239316239316, + "grad_norm": 0.0011444406118243933, + "learning_rate": 4.819563152896487e-06, + "loss": 0.0001, + "step": 13250 + }, + { + "epoch": 56.00666666666667, + "grad_norm": 0.005999968387186527, + "learning_rate": 4.814814814814815e-06, + "loss": 0.0003, + "step": 13260 + }, + { + "epoch": 56.00709401709402, + "grad_norm": 0.006480031181126833, + "learning_rate": 4.810066476733144e-06, + "loss": 0.1278, + "step": 13270 + }, + { + "epoch": 56.00752136752137, + "grad_norm": 0.001407441683113575, + "learning_rate": 4.8053181386514724e-06, + "loss": 0.0011, + "step": 13280 + }, + { + "epoch": 56.007948717948715, + "grad_norm": 0.008889264427125454, + "learning_rate": 4.800569800569801e-06, + "loss": 0.0001, + "step": 13290 + }, + { + "epoch": 56.008376068376066, + "grad_norm": 0.0005853257607668638, + "learning_rate": 4.795821462488129e-06, + "loss": 0.0242, + "step": 13300 + }, + { + "epoch": 56.00880341880342, + "grad_norm": 0.0004666523600462824, + "learning_rate": 4.791073124406458e-06, + "loss": 0.0001, + "step": 13310 + }, + { + "epoch": 56.00923076923077, + "grad_norm": 404.7363586425781, + "learning_rate": 4.786324786324787e-06, + "loss": 0.7525, + "step": 13320 + }, + { + "epoch": 56.00965811965812, + "grad_norm": 0.0004823113849852234, + "learning_rate": 4.781576448243116e-06, + "loss": 0.0001, + "step": 13330 + }, + { + "epoch": 56.01, + "eval_accuracy": 0.4, + "eval_loss": 5.58083438873291, + "eval_runtime": 33.7533, + "eval_samples_per_second": 0.741, + "eval_steps_per_second": 0.741, + "step": 13338 + }, + { + "epoch": 57.00008547008547, + "grad_norm": 0.001120713190175593, + "learning_rate": 4.776828110161444e-06, + "loss": 0.0001, + "step": 13340 + }, + { + "epoch": 57.00051282051282, + "grad_norm": 0.0011573611991479993, + "learning_rate": 4.772079772079772e-06, + "loss": 0.0, + "step": 13350 + }, + { + "epoch": 57.00094017094017, + "grad_norm": 0.0003190473944414407, + "learning_rate": 4.7673314339981005e-06, + "loss": 0.0, + "step": 13360 + }, + { + "epoch": 57.00136752136752, + "grad_norm": 0.29808613657951355, + "learning_rate": 4.76258309591643e-06, + "loss": 0.0003, + "step": 13370 + }, + { + "epoch": 57.00179487179487, + "grad_norm": 0.0006945320055820048, + "learning_rate": 4.757834757834758e-06, + "loss": 0.0001, + "step": 13380 + }, + { + "epoch": 57.00222222222222, + "grad_norm": 0.0003567099920473993, + "learning_rate": 4.753086419753087e-06, + "loss": 0.0001, + "step": 13390 + }, + { + "epoch": 57.002649572649574, + "grad_norm": 0.012886996380984783, + "learning_rate": 4.748338081671415e-06, + "loss": 0.0, + "step": 13400 + }, + { + "epoch": 57.003076923076925, + "grad_norm": 0.0008997747790999711, + "learning_rate": 4.743589743589744e-06, + "loss": 0.0001, + "step": 13410 + }, + { + "epoch": 57.00350427350428, + "grad_norm": 0.008139757439494133, + "learning_rate": 4.738841405508073e-06, + "loss": 0.0, + "step": 13420 + }, + { + "epoch": 57.00393162393162, + "grad_norm": 0.008020474575459957, + "learning_rate": 4.734093067426401e-06, + "loss": 0.0001, + "step": 13430 + }, + { + "epoch": 57.00435897435897, + "grad_norm": 0.0004538062203209847, + "learning_rate": 4.729344729344729e-06, + "loss": 0.0001, + "step": 13440 + }, + { + "epoch": 57.00478632478632, + "grad_norm": 0.0063064745627343655, + "learning_rate": 4.7245963912630586e-06, + "loss": 0.0004, + "step": 13450 + }, + { + "epoch": 57.005213675213675, + "grad_norm": 0.001134122721850872, + "learning_rate": 4.719848053181387e-06, + "loss": 0.0, + "step": 13460 + }, + { + "epoch": 57.005641025641026, + "grad_norm": 0.01796836219727993, + "learning_rate": 4.715099715099716e-06, + "loss": 0.0001, + "step": 13470 + }, + { + "epoch": 57.00606837606838, + "grad_norm": 0.00409694854170084, + "learning_rate": 4.710351377018044e-06, + "loss": 0.0002, + "step": 13480 + }, + { + "epoch": 57.00649572649573, + "grad_norm": 0.00029308826196938753, + "learning_rate": 4.705603038936373e-06, + "loss": 0.0033, + "step": 13490 + }, + { + "epoch": 57.00692307692308, + "grad_norm": 0.0007244080770760775, + "learning_rate": 4.700854700854701e-06, + "loss": 0.0, + "step": 13500 + }, + { + "epoch": 57.007350427350424, + "grad_norm": 0.012751772068440914, + "learning_rate": 4.69610636277303e-06, + "loss": 0.0001, + "step": 13510 + }, + { + "epoch": 57.007777777777775, + "grad_norm": 0.0007130270823836327, + "learning_rate": 4.691358024691358e-06, + "loss": 0.0, + "step": 13520 + }, + { + "epoch": 57.00820512820513, + "grad_norm": 0.0003145153750665486, + "learning_rate": 4.6866096866096875e-06, + "loss": 0.1948, + "step": 13530 + }, + { + "epoch": 57.00863247863248, + "grad_norm": 0.00031738655525259674, + "learning_rate": 4.681861348528016e-06, + "loss": 0.0, + "step": 13540 + }, + { + "epoch": 57.00905982905983, + "grad_norm": 0.0002995093527715653, + "learning_rate": 4.677113010446344e-06, + "loss": 0.0001, + "step": 13550 + }, + { + "epoch": 57.00948717948718, + "grad_norm": 0.00032525527058169246, + "learning_rate": 4.672364672364672e-06, + "loss": 0.0, + "step": 13560 + }, + { + "epoch": 57.00991452991453, + "grad_norm": 0.00266217440366745, + "learning_rate": 4.6676163342830015e-06, + "loss": 0.0, + "step": 13570 + }, + { + "epoch": 57.01, + "eval_accuracy": 0.44, + "eval_loss": 5.7661638259887695, + "eval_runtime": 33.6667, + "eval_samples_per_second": 0.743, + "eval_steps_per_second": 0.743, + "step": 13572 + }, + { + "epoch": 58.00034188034188, + "grad_norm": 956.22998046875, + "learning_rate": 4.66286799620133e-06, + "loss": 0.9196, + "step": 13580 + }, + { + "epoch": 58.00076923076923, + "grad_norm": 0.0004830555117223412, + "learning_rate": 4.658119658119659e-06, + "loss": 0.0, + "step": 13590 + }, + { + "epoch": 58.00119658119658, + "grad_norm": 0.00030270041315816343, + "learning_rate": 4.653371320037987e-06, + "loss": 0.0, + "step": 13600 + }, + { + "epoch": 58.00162393162393, + "grad_norm": 0.0006561644258908927, + "learning_rate": 4.6486229819563155e-06, + "loss": 0.0, + "step": 13610 + }, + { + "epoch": 58.00205128205128, + "grad_norm": 0.0006711311871185899, + "learning_rate": 4.643874643874644e-06, + "loss": 0.0, + "step": 13620 + }, + { + "epoch": 58.002478632478635, + "grad_norm": 0.0025408940855413675, + "learning_rate": 4.639126305792973e-06, + "loss": 0.0, + "step": 13630 + }, + { + "epoch": 58.002905982905986, + "grad_norm": 0.0009754471248015761, + "learning_rate": 4.634377967711301e-06, + "loss": 0.0, + "step": 13640 + }, + { + "epoch": 58.00333333333333, + "grad_norm": 0.0024775350466370583, + "learning_rate": 4.62962962962963e-06, + "loss": 0.0, + "step": 13650 + }, + { + "epoch": 58.00376068376068, + "grad_norm": 0.007733345031738281, + "learning_rate": 4.624881291547959e-06, + "loss": 0.0001, + "step": 13660 + }, + { + "epoch": 58.00418803418803, + "grad_norm": 0.00038200884591788054, + "learning_rate": 4.620132953466287e-06, + "loss": 0.0, + "step": 13670 + }, + { + "epoch": 58.004615384615384, + "grad_norm": 0.00032710927189327776, + "learning_rate": 4.615384615384616e-06, + "loss": 0.0, + "step": 13680 + }, + { + "epoch": 58.005042735042736, + "grad_norm": 0.00046126110828481615, + "learning_rate": 4.6106362773029444e-06, + "loss": 0.0001, + "step": 13690 + }, + { + "epoch": 58.00547008547009, + "grad_norm": 0.0009468531352467835, + "learning_rate": 4.605887939221273e-06, + "loss": 0.0001, + "step": 13700 + }, + { + "epoch": 58.00589743589744, + "grad_norm": 0.0003751327458303422, + "learning_rate": 4.601139601139601e-06, + "loss": 0.0001, + "step": 13710 + }, + { + "epoch": 58.00632478632479, + "grad_norm": 0.0024412404745817184, + "learning_rate": 4.59639126305793e-06, + "loss": 0.0, + "step": 13720 + }, + { + "epoch": 58.006752136752134, + "grad_norm": 0.0011317358585074544, + "learning_rate": 4.5916429249762585e-06, + "loss": 0.0, + "step": 13730 + }, + { + "epoch": 58.007179487179485, + "grad_norm": 0.00213524978607893, + "learning_rate": 4.586894586894588e-06, + "loss": 0.0001, + "step": 13740 + }, + { + "epoch": 58.007606837606836, + "grad_norm": 0.00029954416095279157, + "learning_rate": 4.582146248812916e-06, + "loss": 0.0001, + "step": 13750 + }, + { + "epoch": 58.00803418803419, + "grad_norm": 0.004025555215775967, + "learning_rate": 4.577397910731244e-06, + "loss": 0.0, + "step": 13760 + }, + { + "epoch": 58.00846153846154, + "grad_norm": 0.0070886253379285336, + "learning_rate": 4.5726495726495725e-06, + "loss": 0.5328, + "step": 13770 + }, + { + "epoch": 58.00888888888889, + "grad_norm": 0.0006350260809995234, + "learning_rate": 4.567901234567902e-06, + "loss": 0.0, + "step": 13780 + }, + { + "epoch": 58.00931623931624, + "grad_norm": 89.9181900024414, + "learning_rate": 4.563152896486231e-06, + "loss": 0.0124, + "step": 13790 + }, + { + "epoch": 58.00974358974359, + "grad_norm": 0.000428661733167246, + "learning_rate": 4.558404558404559e-06, + "loss": 0.0001, + "step": 13800 + }, + { + "epoch": 58.01, + "eval_accuracy": 0.44, + "eval_loss": 5.446341037750244, + "eval_runtime": 33.6227, + "eval_samples_per_second": 0.744, + "eval_steps_per_second": 0.744, + "step": 13806 + }, + { + "epoch": 59.00017094017094, + "grad_norm": 0.0007079663337208331, + "learning_rate": 4.553656220322887e-06, + "loss": 0.0001, + "step": 13810 + }, + { + "epoch": 59.00059829059829, + "grad_norm": 0.0012396123493090272, + "learning_rate": 4.548907882241216e-06, + "loss": 1.3237, + "step": 13820 + }, + { + "epoch": 59.00102564102564, + "grad_norm": 0.005205500405281782, + "learning_rate": 4.544159544159544e-06, + "loss": 0.4377, + "step": 13830 + }, + { + "epoch": 59.00145299145299, + "grad_norm": 0.00035839254269376397, + "learning_rate": 4.539411206077873e-06, + "loss": 0.0083, + "step": 13840 + }, + { + "epoch": 59.001880341880344, + "grad_norm": 0.0005130122299306095, + "learning_rate": 4.5346628679962014e-06, + "loss": 0.0, + "step": 13850 + }, + { + "epoch": 59.002307692307696, + "grad_norm": 0.0014134275261312723, + "learning_rate": 4.5299145299145306e-06, + "loss": 0.0, + "step": 13860 + }, + { + "epoch": 59.00273504273504, + "grad_norm": 0.0625193640589714, + "learning_rate": 4.525166191832859e-06, + "loss": 0.0001, + "step": 13870 + }, + { + "epoch": 59.00316239316239, + "grad_norm": 0.0004801045579370111, + "learning_rate": 4.520417853751187e-06, + "loss": 0.0001, + "step": 13880 + }, + { + "epoch": 59.00358974358974, + "grad_norm": 0.0007496966863982379, + "learning_rate": 4.515669515669516e-06, + "loss": 0.1716, + "step": 13890 + }, + { + "epoch": 59.004017094017094, + "grad_norm": 0.0012893045786768198, + "learning_rate": 4.510921177587845e-06, + "loss": 0.0, + "step": 13900 + }, + { + "epoch": 59.004444444444445, + "grad_norm": 0.0003310160245746374, + "learning_rate": 4.506172839506173e-06, + "loss": 0.0001, + "step": 13910 + }, + { + "epoch": 59.0048717948718, + "grad_norm": 44.41610336303711, + "learning_rate": 4.501424501424502e-06, + "loss": 0.0076, + "step": 13920 + }, + { + "epoch": 59.00529914529915, + "grad_norm": 0.0027762632817029953, + "learning_rate": 4.49667616334283e-06, + "loss": 0.0, + "step": 13930 + }, + { + "epoch": 59.0057264957265, + "grad_norm": 0.0003408943011891097, + "learning_rate": 4.4919278252611595e-06, + "loss": 0.0, + "step": 13940 + }, + { + "epoch": 59.00615384615384, + "grad_norm": 0.0014558567199856043, + "learning_rate": 4.487179487179488e-06, + "loss": 0.0, + "step": 13950 + }, + { + "epoch": 59.006581196581195, + "grad_norm": 0.002356948796659708, + "learning_rate": 4.482431149097816e-06, + "loss": 0.0108, + "step": 13960 + }, + { + "epoch": 59.007008547008546, + "grad_norm": 0.0007559444638900459, + "learning_rate": 4.477682811016144e-06, + "loss": 0.6313, + "step": 13970 + }, + { + "epoch": 59.0074358974359, + "grad_norm": 0.004984420258551836, + "learning_rate": 4.4729344729344735e-06, + "loss": 0.1682, + "step": 13980 + }, + { + "epoch": 59.00786324786325, + "grad_norm": 0.0007952677551656961, + "learning_rate": 4.468186134852802e-06, + "loss": 0.9454, + "step": 13990 + }, + { + "epoch": 59.0082905982906, + "grad_norm": 0.02415173314511776, + "learning_rate": 4.463437796771131e-06, + "loss": 0.0001, + "step": 14000 + }, + { + "epoch": 59.00871794871795, + "grad_norm": 0.0003063694457523525, + "learning_rate": 4.458689458689459e-06, + "loss": 0.0001, + "step": 14010 + }, + { + "epoch": 59.0091452991453, + "grad_norm": 0.00032008966081775725, + "learning_rate": 4.4539411206077876e-06, + "loss": 0.4779, + "step": 14020 + }, + { + "epoch": 59.00957264957265, + "grad_norm": 0.00046775556984357536, + "learning_rate": 4.449192782526116e-06, + "loss": 0.0001, + "step": 14030 + }, + { + "epoch": 59.01, + "grad_norm": 0.06117213889956474, + "learning_rate": 4.444444444444444e-06, + "loss": 0.0021, + "step": 14040 + }, + { + "epoch": 59.01, + "eval_accuracy": 0.44, + "eval_loss": 5.957569599151611, + "eval_runtime": 36.0184, + "eval_samples_per_second": 0.694, + "eval_steps_per_second": 0.694, + "step": 14040 + }, + { + "epoch": 60.00042735042735, + "grad_norm": 0.0008756799506954849, + "learning_rate": 4.439696106362773e-06, + "loss": 0.6096, + "step": 14050 + }, + { + "epoch": 60.0008547008547, + "grad_norm": 945.39501953125, + "learning_rate": 4.4349477682811024e-06, + "loss": 0.5127, + "step": 14060 + }, + { + "epoch": 60.001282051282054, + "grad_norm": 0.0003207973495591432, + "learning_rate": 4.430199430199431e-06, + "loss": 0.0, + "step": 14070 + }, + { + "epoch": 60.001709401709405, + "grad_norm": 0.005959488917142153, + "learning_rate": 4.425451092117759e-06, + "loss": 0.0001, + "step": 14080 + }, + { + "epoch": 60.00213675213675, + "grad_norm": 0.0004705670871771872, + "learning_rate": 4.420702754036087e-06, + "loss": 0.0, + "step": 14090 + }, + { + "epoch": 60.0025641025641, + "grad_norm": 0.0004403094353619963, + "learning_rate": 4.4159544159544165e-06, + "loss": 0.0, + "step": 14100 + }, + { + "epoch": 60.00299145299145, + "grad_norm": 707.6570434570312, + "learning_rate": 4.411206077872745e-06, + "loss": 0.7011, + "step": 14110 + }, + { + "epoch": 60.0034188034188, + "grad_norm": 0.00034931182744912803, + "learning_rate": 4.406457739791074e-06, + "loss": 0.0011, + "step": 14120 + }, + { + "epoch": 60.003846153846155, + "grad_norm": 0.00047842093044891953, + "learning_rate": 4.401709401709402e-06, + "loss": 0.7669, + "step": 14130 + }, + { + "epoch": 60.004273504273506, + "grad_norm": 0.0005029200110584497, + "learning_rate": 4.3969610636277305e-06, + "loss": 0.6763, + "step": 14140 + }, + { + "epoch": 60.00470085470086, + "grad_norm": 0.0007298401324078441, + "learning_rate": 4.39221272554606e-06, + "loss": 0.0001, + "step": 14150 + }, + { + "epoch": 60.00512820512821, + "grad_norm": 0.08432787656784058, + "learning_rate": 4.387464387464388e-06, + "loss": 1.019, + "step": 14160 + }, + { + "epoch": 60.00555555555555, + "grad_norm": 0.011608411557972431, + "learning_rate": 4.382716049382716e-06, + "loss": 0.0, + "step": 14170 + }, + { + "epoch": 60.005982905982904, + "grad_norm": 0.0004716445109806955, + "learning_rate": 4.3779677113010445e-06, + "loss": 1.0345, + "step": 14180 + }, + { + "epoch": 60.006410256410255, + "grad_norm": 0.0010066544637084007, + "learning_rate": 4.373219373219374e-06, + "loss": 0.8799, + "step": 14190 + }, + { + "epoch": 60.00683760683761, + "grad_norm": 0.01125133316963911, + "learning_rate": 4.368471035137703e-06, + "loss": 0.0012, + "step": 14200 + }, + { + "epoch": 60.00726495726496, + "grad_norm": 0.0006305111455731094, + "learning_rate": 4.363722697056031e-06, + "loss": 0.7549, + "step": 14210 + }, + { + "epoch": 60.00769230769231, + "grad_norm": 0.003195669502019882, + "learning_rate": 4.358974358974359e-06, + "loss": 0.0, + "step": 14220 + }, + { + "epoch": 60.00811965811966, + "grad_norm": 0.20871169865131378, + "learning_rate": 4.354226020892688e-06, + "loss": 0.0753, + "step": 14230 + }, + { + "epoch": 60.00854700854701, + "grad_norm": 0.0661257803440094, + "learning_rate": 4.349477682811016e-06, + "loss": 0.0001, + "step": 14240 + }, + { + "epoch": 60.008974358974356, + "grad_norm": 0.0009889448992908, + "learning_rate": 4.344729344729345e-06, + "loss": 0.0002, + "step": 14250 + }, + { + "epoch": 60.00940170940171, + "grad_norm": 0.009586402215063572, + "learning_rate": 4.3399810066476735e-06, + "loss": 0.0004, + "step": 14260 + }, + { + "epoch": 60.00982905982906, + "grad_norm": 558.0621337890625, + "learning_rate": 4.335232668566003e-06, + "loss": 0.5042, + "step": 14270 + }, + { + "epoch": 60.01, + "eval_accuracy": 0.4, + "eval_loss": 5.9418511390686035, + "eval_runtime": 37.744, + "eval_samples_per_second": 0.662, + "eval_steps_per_second": 0.662, + "step": 14274 + }, + { + "epoch": 61.00025641025641, + "grad_norm": 0.00028759066481143236, + "learning_rate": 4.330484330484331e-06, + "loss": 0.9175, + "step": 14280 + }, + { + "epoch": 61.00068376068376, + "grad_norm": 0.007706713397055864, + "learning_rate": 4.325735992402659e-06, + "loss": 0.0001, + "step": 14290 + }, + { + "epoch": 61.00111111111111, + "grad_norm": 0.0005172354285605252, + "learning_rate": 4.3209876543209875e-06, + "loss": 0.0, + "step": 14300 + }, + { + "epoch": 61.00153846153846, + "grad_norm": 0.0005246769869700074, + "learning_rate": 4.316239316239317e-06, + "loss": 0.6762, + "step": 14310 + }, + { + "epoch": 61.00196581196581, + "grad_norm": 0.0011663725599646568, + "learning_rate": 4.311490978157645e-06, + "loss": 0.0, + "step": 14320 + }, + { + "epoch": 61.00239316239316, + "grad_norm": 0.005837870761752129, + "learning_rate": 4.306742640075974e-06, + "loss": 0.0907, + "step": 14330 + }, + { + "epoch": 61.00282051282051, + "grad_norm": 0.00042240836773999035, + "learning_rate": 4.301994301994302e-06, + "loss": 0.0, + "step": 14340 + }, + { + "epoch": 61.003247863247864, + "grad_norm": 0.0007428540848195553, + "learning_rate": 4.297245963912631e-06, + "loss": 0.0001, + "step": 14350 + }, + { + "epoch": 61.003675213675216, + "grad_norm": 0.0005527780158445239, + "learning_rate": 4.29249762583096e-06, + "loss": 0.0, + "step": 14360 + }, + { + "epoch": 61.00410256410257, + "grad_norm": 0.001011466607451439, + "learning_rate": 4.287749287749288e-06, + "loss": 0.0001, + "step": 14370 + }, + { + "epoch": 61.00452991452991, + "grad_norm": 17.799911499023438, + "learning_rate": 4.283000949667616e-06, + "loss": 0.0566, + "step": 14380 + }, + { + "epoch": 61.00495726495726, + "grad_norm": 20.882823944091797, + "learning_rate": 4.2782526115859455e-06, + "loss": 0.0043, + "step": 14390 + }, + { + "epoch": 61.005384615384614, + "grad_norm": 0.00030632095877081156, + "learning_rate": 4.273504273504274e-06, + "loss": 0.0004, + "step": 14400 + }, + { + "epoch": 61.005811965811965, + "grad_norm": 0.000464937009382993, + "learning_rate": 4.268755935422603e-06, + "loss": 0.0001, + "step": 14410 + }, + { + "epoch": 61.006239316239316, + "grad_norm": 0.0014673583209514618, + "learning_rate": 4.264007597340931e-06, + "loss": 0.0001, + "step": 14420 + }, + { + "epoch": 61.00666666666667, + "grad_norm": 0.0011050072498619556, + "learning_rate": 4.2592592592592596e-06, + "loss": 0.0, + "step": 14430 + }, + { + "epoch": 61.00709401709402, + "grad_norm": 408.4623107910156, + "learning_rate": 4.254510921177588e-06, + "loss": 0.5193, + "step": 14440 + }, + { + "epoch": 61.00752136752137, + "grad_norm": 0.0010529108112677932, + "learning_rate": 4.249762583095917e-06, + "loss": 0.0011, + "step": 14450 + }, + { + "epoch": 61.007948717948715, + "grad_norm": 0.0004740229342132807, + "learning_rate": 4.245014245014245e-06, + "loss": 0.0001, + "step": 14460 + }, + { + "epoch": 61.008376068376066, + "grad_norm": 0.0004031808057334274, + "learning_rate": 4.2402659069325745e-06, + "loss": 0.0, + "step": 14470 + }, + { + "epoch": 61.00880341880342, + "grad_norm": 0.00028951760032214224, + "learning_rate": 4.235517568850903e-06, + "loss": 0.0, + "step": 14480 + }, + { + "epoch": 61.00923076923077, + "grad_norm": 0.00033271079882979393, + "learning_rate": 4.230769230769231e-06, + "loss": 0.0001, + "step": 14490 + }, + { + "epoch": 61.00965811965812, + "grad_norm": 41.438385009765625, + "learning_rate": 4.226020892687559e-06, + "loss": 0.0053, + "step": 14500 + }, + { + "epoch": 61.01, + "eval_accuracy": 0.48, + "eval_loss": 5.297748565673828, + "eval_runtime": 37.9181, + "eval_samples_per_second": 0.659, + "eval_steps_per_second": 0.659, + "step": 14508 + }, + { + "epoch": 62.00008547008547, + "grad_norm": 0.0023157300893217325, + "learning_rate": 4.221272554605888e-06, + "loss": 0.0, + "step": 14510 + }, + { + "epoch": 62.00051282051282, + "grad_norm": 0.000285146088572219, + "learning_rate": 4.216524216524217e-06, + "loss": 0.0, + "step": 14520 + }, + { + "epoch": 62.00094017094017, + "grad_norm": 0.00043018904398195446, + "learning_rate": 4.211775878442546e-06, + "loss": 0.0, + "step": 14530 + }, + { + "epoch": 62.00136752136752, + "grad_norm": 0.0006764789577573538, + "learning_rate": 4.207027540360874e-06, + "loss": 0.2522, + "step": 14540 + }, + { + "epoch": 62.00179487179487, + "grad_norm": 0.0003662327362690121, + "learning_rate": 4.2022792022792025e-06, + "loss": 0.0, + "step": 14550 + }, + { + "epoch": 62.00222222222222, + "grad_norm": 0.0002613349643070251, + "learning_rate": 4.197530864197531e-06, + "loss": 0.0001, + "step": 14560 + }, + { + "epoch": 62.002649572649574, + "grad_norm": 0.0002579360152594745, + "learning_rate": 4.19278252611586e-06, + "loss": 0.0001, + "step": 14570 + }, + { + "epoch": 62.003076923076925, + "grad_norm": 0.002828446449711919, + "learning_rate": 4.188034188034188e-06, + "loss": 0.0001, + "step": 14580 + }, + { + "epoch": 62.00350427350428, + "grad_norm": 0.02188822254538536, + "learning_rate": 4.1832858499525166e-06, + "loss": 0.0, + "step": 14590 + }, + { + "epoch": 62.00393162393162, + "grad_norm": 0.0006723613478243351, + "learning_rate": 4.178537511870846e-06, + "loss": 0.0, + "step": 14600 + }, + { + "epoch": 62.00435897435897, + "grad_norm": 0.0004710882785730064, + "learning_rate": 4.173789173789174e-06, + "loss": 0.0, + "step": 14610 + }, + { + "epoch": 62.00478632478632, + "grad_norm": 0.0002582537126727402, + "learning_rate": 4.169040835707503e-06, + "loss": 0.0, + "step": 14620 + }, + { + "epoch": 62.005213675213675, + "grad_norm": 0.0019376453710719943, + "learning_rate": 4.1642924976258314e-06, + "loss": 0.0001, + "step": 14630 + }, + { + "epoch": 62.005641025641026, + "grad_norm": 0.00032024920801632106, + "learning_rate": 4.15954415954416e-06, + "loss": 0.0, + "step": 14640 + }, + { + "epoch": 62.00606837606838, + "grad_norm": 0.00024245944223366678, + "learning_rate": 4.154795821462488e-06, + "loss": 0.0, + "step": 14650 + }, + { + "epoch": 62.00649572649573, + "grad_norm": 0.0006904223700985312, + "learning_rate": 4.150047483380817e-06, + "loss": 0.7856, + "step": 14660 + }, + { + "epoch": 62.00692307692308, + "grad_norm": 0.00029405683744698763, + "learning_rate": 4.145299145299146e-06, + "loss": 0.0632, + "step": 14670 + }, + { + "epoch": 62.007350427350424, + "grad_norm": 0.0007452707504853606, + "learning_rate": 4.140550807217475e-06, + "loss": 0.0, + "step": 14680 + }, + { + "epoch": 62.007777777777775, + "grad_norm": 559.78662109375, + "learning_rate": 4.135802469135803e-06, + "loss": 0.3101, + "step": 14690 + }, + { + "epoch": 62.00820512820513, + "grad_norm": 0.02688130922615528, + "learning_rate": 4.131054131054131e-06, + "loss": 0.0001, + "step": 14700 + }, + { + "epoch": 62.00863247863248, + "grad_norm": 2.4815587997436523, + "learning_rate": 4.1263057929724595e-06, + "loss": 0.0005, + "step": 14710 + }, + { + "epoch": 62.00905982905983, + "grad_norm": 0.004581361077725887, + "learning_rate": 4.121557454890789e-06, + "loss": 0.0031, + "step": 14720 + }, + { + "epoch": 62.00948717948718, + "grad_norm": 0.017405280843377113, + "learning_rate": 4.116809116809117e-06, + "loss": 0.308, + "step": 14730 + }, + { + "epoch": 62.00991452991453, + "grad_norm": 0.001918576774187386, + "learning_rate": 4.112060778727446e-06, + "loss": 0.0, + "step": 14740 + }, + { + "epoch": 62.01, + "eval_accuracy": 0.4, + "eval_loss": 5.854069232940674, + "eval_runtime": 32.1126, + "eval_samples_per_second": 0.779, + "eval_steps_per_second": 0.779, + "step": 14742 + }, + { + "epoch": 63.00034188034188, + "grad_norm": 0.004541910719126463, + "learning_rate": 4.107312440645774e-06, + "loss": 0.0, + "step": 14750 + }, + { + "epoch": 63.00076923076923, + "grad_norm": 0.0010953666642308235, + "learning_rate": 4.102564102564103e-06, + "loss": 0.0001, + "step": 14760 + }, + { + "epoch": 63.00119658119658, + "grad_norm": 0.001070062629878521, + "learning_rate": 4.097815764482431e-06, + "loss": 0.0001, + "step": 14770 + }, + { + "epoch": 63.00162393162393, + "grad_norm": 0.001508062588982284, + "learning_rate": 4.09306742640076e-06, + "loss": 0.0001, + "step": 14780 + }, + { + "epoch": 63.00205128205128, + "grad_norm": 0.0003479434235487133, + "learning_rate": 4.088319088319088e-06, + "loss": 0.0, + "step": 14790 + }, + { + "epoch": 63.002478632478635, + "grad_norm": 0.0033716338220983744, + "learning_rate": 4.0835707502374176e-06, + "loss": 0.0001, + "step": 14800 + }, + { + "epoch": 63.002905982905986, + "grad_norm": 0.0008653479162603617, + "learning_rate": 4.078822412155746e-06, + "loss": 0.0173, + "step": 14810 + }, + { + "epoch": 63.00333333333333, + "grad_norm": 0.00030156198772601783, + "learning_rate": 4.074074074074074e-06, + "loss": 0.0, + "step": 14820 + }, + { + "epoch": 63.00376068376068, + "grad_norm": 0.0004595111822709441, + "learning_rate": 4.069325735992403e-06, + "loss": 0.0, + "step": 14830 + }, + { + "epoch": 63.00418803418803, + "grad_norm": 0.0010495609603822231, + "learning_rate": 4.064577397910732e-06, + "loss": 0.0, + "step": 14840 + }, + { + "epoch": 63.004615384615384, + "grad_norm": 0.004493600223213434, + "learning_rate": 4.05982905982906e-06, + "loss": 0.6305, + "step": 14850 + }, + { + "epoch": 63.005042735042736, + "grad_norm": 0.0505562499165535, + "learning_rate": 4.055080721747389e-06, + "loss": 0.0016, + "step": 14860 + }, + { + "epoch": 63.00547008547009, + "grad_norm": 0.002653477480635047, + "learning_rate": 4.050332383665717e-06, + "loss": 0.0001, + "step": 14870 + }, + { + "epoch": 63.00589743589744, + "grad_norm": 0.0020723838824778795, + "learning_rate": 4.0455840455840465e-06, + "loss": 0.0001, + "step": 14880 + }, + { + "epoch": 63.00632478632479, + "grad_norm": 0.0002628070651553571, + "learning_rate": 4.040835707502375e-06, + "loss": 0.0004, + "step": 14890 + }, + { + "epoch": 63.006752136752134, + "grad_norm": 0.0003271848545409739, + "learning_rate": 4.036087369420703e-06, + "loss": 0.0001, + "step": 14900 + }, + { + "epoch": 63.007179487179485, + "grad_norm": 0.0002233712439192459, + "learning_rate": 4.031339031339031e-06, + "loss": 0.0013, + "step": 14910 + }, + { + "epoch": 63.007606837606836, + "grad_norm": 0.000736577610950917, + "learning_rate": 4.02659069325736e-06, + "loss": 0.0, + "step": 14920 + }, + { + "epoch": 63.00803418803419, + "grad_norm": 0.0002595110272523016, + "learning_rate": 4.021842355175689e-06, + "loss": 0.0, + "step": 14930 + }, + { + "epoch": 63.00846153846154, + "grad_norm": 0.00025847507640719414, + "learning_rate": 4.017094017094018e-06, + "loss": 0.0, + "step": 14940 + }, + { + "epoch": 63.00888888888889, + "grad_norm": 0.0003213146119378507, + "learning_rate": 4.012345679012346e-06, + "loss": 0.0, + "step": 14950 + }, + { + "epoch": 63.00931623931624, + "grad_norm": 111.42301177978516, + "learning_rate": 4.0075973409306745e-06, + "loss": 0.0102, + "step": 14960 + }, + { + "epoch": 63.00974358974359, + "grad_norm": 705.5123291015625, + "learning_rate": 4.002849002849003e-06, + "loss": 0.1555, + "step": 14970 + }, + { + "epoch": 63.01, + "eval_accuracy": 0.4, + "eval_loss": 6.5367207527160645, + "eval_runtime": 31.9232, + "eval_samples_per_second": 0.783, + "eval_steps_per_second": 0.783, + "step": 14976 + }, + { + "epoch": 64.00017094017095, + "grad_norm": 0.42229506373405457, + "learning_rate": 3.998100664767331e-06, + "loss": 0.1863, + "step": 14980 + }, + { + "epoch": 64.00059829059829, + "grad_norm": 0.00032686052145436406, + "learning_rate": 3.99335232668566e-06, + "loss": 0.0, + "step": 14990 + }, + { + "epoch": 64.00102564102563, + "grad_norm": 0.0032908704597502947, + "learning_rate": 3.9886039886039894e-06, + "loss": 0.0002, + "step": 15000 + }, + { + "epoch": 64.001452991453, + "grad_norm": 3.955153703689575, + "learning_rate": 3.983855650522318e-06, + "loss": 0.0007, + "step": 15010 + }, + { + "epoch": 64.00188034188034, + "grad_norm": 0.0003446421178523451, + "learning_rate": 3.979107312440646e-06, + "loss": 1.0107, + "step": 15020 + }, + { + "epoch": 64.0023076923077, + "grad_norm": 0.002301581669598818, + "learning_rate": 3.974358974358974e-06, + "loss": 0.0, + "step": 15030 + }, + { + "epoch": 64.00273504273504, + "grad_norm": 0.0007779121515341103, + "learning_rate": 3.9696106362773035e-06, + "loss": 0.7845, + "step": 15040 + }, + { + "epoch": 64.0031623931624, + "grad_norm": 0.11843976378440857, + "learning_rate": 3.964862298195632e-06, + "loss": 0.1222, + "step": 15050 + }, + { + "epoch": 64.00358974358974, + "grad_norm": 0.020959218963980675, + "learning_rate": 3.96011396011396e-06, + "loss": 0.0, + "step": 15060 + }, + { + "epoch": 64.0040170940171, + "grad_norm": 506.3919982910156, + "learning_rate": 3.955365622032289e-06, + "loss": 0.0546, + "step": 15070 + }, + { + "epoch": 64.00444444444445, + "grad_norm": 0.0002198971196776256, + "learning_rate": 3.9506172839506175e-06, + "loss": 0.0, + "step": 15080 + }, + { + "epoch": 64.00487179487179, + "grad_norm": 0.008080464787781239, + "learning_rate": 3.945868945868947e-06, + "loss": 0.0001, + "step": 15090 + }, + { + "epoch": 64.00529914529915, + "grad_norm": 0.002219392219558358, + "learning_rate": 3.941120607787275e-06, + "loss": 0.0007, + "step": 15100 + }, + { + "epoch": 64.00572649572649, + "grad_norm": 0.0003887212951667607, + "learning_rate": 3.936372269705603e-06, + "loss": 0.0016, + "step": 15110 + }, + { + "epoch": 64.00615384615385, + "grad_norm": 0.001071627251803875, + "learning_rate": 3.9316239316239315e-06, + "loss": 0.0, + "step": 15120 + }, + { + "epoch": 64.0065811965812, + "grad_norm": 0.00028002672479487956, + "learning_rate": 3.926875593542261e-06, + "loss": 0.0, + "step": 15130 + }, + { + "epoch": 64.00700854700855, + "grad_norm": 0.00031417288118973374, + "learning_rate": 3.922127255460589e-06, + "loss": 0.0, + "step": 15140 + }, + { + "epoch": 64.0074358974359, + "grad_norm": 0.0010667052119970322, + "learning_rate": 3.917378917378918e-06, + "loss": 0.0273, + "step": 15150 + }, + { + "epoch": 64.00786324786324, + "grad_norm": 0.0006762260454706848, + "learning_rate": 3.912630579297246e-06, + "loss": 0.051, + "step": 15160 + }, + { + "epoch": 64.0082905982906, + "grad_norm": 0.006420073565095663, + "learning_rate": 3.907882241215575e-06, + "loss": 0.0, + "step": 15170 + }, + { + "epoch": 64.00871794871794, + "grad_norm": 0.0005878534284420311, + "learning_rate": 3.903133903133903e-06, + "loss": 0.0, + "step": 15180 + }, + { + "epoch": 64.0091452991453, + "grad_norm": 0.0019342320738360286, + "learning_rate": 3.898385565052232e-06, + "loss": 0.0, + "step": 15190 + }, + { + "epoch": 64.00957264957265, + "grad_norm": 0.00036040143459104, + "learning_rate": 3.8936372269705604e-06, + "loss": 0.0002, + "step": 15200 + }, + { + "epoch": 64.01, + "grad_norm": 0.0003253432805649936, + "learning_rate": 3.88888888888889e-06, + "loss": 0.0081, + "step": 15210 + }, + { + "epoch": 64.01, + "eval_accuracy": 0.4, + "eval_loss": 5.480795383453369, + "eval_runtime": 31.9103, + "eval_samples_per_second": 0.783, + "eval_steps_per_second": 0.783, + "step": 15210 + }, + { + "epoch": 65.00042735042734, + "grad_norm": 0.0006786137819290161, + "learning_rate": 3.884140550807218e-06, + "loss": 0.0001, + "step": 15220 + }, + { + "epoch": 65.0008547008547, + "grad_norm": 0.00022978027118369937, + "learning_rate": 3.879392212725546e-06, + "loss": 0.0, + "step": 15230 + }, + { + "epoch": 65.00128205128205, + "grad_norm": 0.0007715200772508979, + "learning_rate": 3.8746438746438745e-06, + "loss": 0.003, + "step": 15240 + }, + { + "epoch": 65.0017094017094, + "grad_norm": 0.02402709610760212, + "learning_rate": 3.869895536562204e-06, + "loss": 0.0, + "step": 15250 + }, + { + "epoch": 65.00213675213675, + "grad_norm": 0.0015532065881416202, + "learning_rate": 3.865147198480532e-06, + "loss": 0.0197, + "step": 15260 + }, + { + "epoch": 65.00256410256411, + "grad_norm": 85.6712417602539, + "learning_rate": 3.860398860398861e-06, + "loss": 1.1, + "step": 15270 + }, + { + "epoch": 65.00299145299145, + "grad_norm": 0.0003184400557074696, + "learning_rate": 3.855650522317189e-06, + "loss": 0.0012, + "step": 15280 + }, + { + "epoch": 65.00341880341881, + "grad_norm": 0.0002915811201091856, + "learning_rate": 3.850902184235518e-06, + "loss": 0.0002, + "step": 15290 + }, + { + "epoch": 65.00384615384615, + "grad_norm": 0.001770331640727818, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0023, + "step": 15300 + }, + { + "epoch": 65.0042735042735, + "grad_norm": 0.0037825191393494606, + "learning_rate": 3.841405508072175e-06, + "loss": 0.0, + "step": 15310 + }, + { + "epoch": 65.00470085470086, + "grad_norm": 0.00039001365075819194, + "learning_rate": 3.836657169990503e-06, + "loss": 0.0, + "step": 15320 + }, + { + "epoch": 65.0051282051282, + "grad_norm": 0.029118506237864494, + "learning_rate": 3.8319088319088325e-06, + "loss": 0.0001, + "step": 15330 + }, + { + "epoch": 65.00555555555556, + "grad_norm": 0.004345818888396025, + "learning_rate": 3.827160493827161e-06, + "loss": 0.0001, + "step": 15340 + }, + { + "epoch": 65.0059829059829, + "grad_norm": 0.0013317528646439314, + "learning_rate": 3.82241215574549e-06, + "loss": 0.0, + "step": 15350 + }, + { + "epoch": 65.00641025641026, + "grad_norm": 0.00033316414919681847, + "learning_rate": 3.817663817663818e-06, + "loss": 0.0, + "step": 15360 + }, + { + "epoch": 65.0068376068376, + "grad_norm": 0.006611653603613377, + "learning_rate": 3.8129154795821466e-06, + "loss": 0.0001, + "step": 15370 + }, + { + "epoch": 65.00726495726495, + "grad_norm": 0.0014399340143427253, + "learning_rate": 3.808167141500475e-06, + "loss": 0.0024, + "step": 15380 + }, + { + "epoch": 65.00769230769231, + "grad_norm": 0.000310930801788345, + "learning_rate": 3.8034188034188036e-06, + "loss": 0.0001, + "step": 15390 + }, + { + "epoch": 65.00811965811965, + "grad_norm": 0.0017023945692926645, + "learning_rate": 3.7986704653371327e-06, + "loss": 0.0, + "step": 15400 + }, + { + "epoch": 65.00854700854701, + "grad_norm": 0.01913696527481079, + "learning_rate": 3.793922127255461e-06, + "loss": 0.0, + "step": 15410 + }, + { + "epoch": 65.00897435897436, + "grad_norm": 0.00039296565228141844, + "learning_rate": 3.7891737891737893e-06, + "loss": 0.0, + "step": 15420 + }, + { + "epoch": 65.00940170940171, + "grad_norm": 0.00039896511589176953, + "learning_rate": 3.784425451092118e-06, + "loss": 1.0074, + "step": 15430 + }, + { + "epoch": 65.00982905982906, + "grad_norm": 0.0005804836982861161, + "learning_rate": 3.7796771130104463e-06, + "loss": 0.0008, + "step": 15440 + }, + { + "epoch": 65.01, + "eval_accuracy": 0.4, + "eval_loss": 5.881788730621338, + "eval_runtime": 35.8586, + "eval_samples_per_second": 0.697, + "eval_steps_per_second": 0.697, + "step": 15444 + }, + { + "epoch": 66.00025641025641, + "grad_norm": 0.0012229053536430001, + "learning_rate": 3.774928774928775e-06, + "loss": 0.0001, + "step": 15450 + }, + { + "epoch": 66.00068376068376, + "grad_norm": 0.0003784662112593651, + "learning_rate": 3.770180436847104e-06, + "loss": 0.0001, + "step": 15460 + }, + { + "epoch": 66.00111111111111, + "grad_norm": 0.0005255657597444952, + "learning_rate": 3.7654320987654325e-06, + "loss": 0.0001, + "step": 15470 + }, + { + "epoch": 66.00153846153846, + "grad_norm": 0.00034700758988037705, + "learning_rate": 3.760683760683761e-06, + "loss": 0.0005, + "step": 15480 + }, + { + "epoch": 66.00196581196582, + "grad_norm": 0.00021708589338231832, + "learning_rate": 3.7559354226020895e-06, + "loss": 0.0, + "step": 15490 + }, + { + "epoch": 66.00239316239316, + "grad_norm": 0.0038388799875974655, + "learning_rate": 3.751187084520418e-06, + "loss": 0.0, + "step": 15500 + }, + { + "epoch": 66.0028205128205, + "grad_norm": 0.0007136081694625318, + "learning_rate": 3.746438746438747e-06, + "loss": 0.0001, + "step": 15510 + }, + { + "epoch": 66.00324786324786, + "grad_norm": 0.0002888034505303949, + "learning_rate": 3.7416904083570752e-06, + "loss": 0.0, + "step": 15520 + }, + { + "epoch": 66.00367521367521, + "grad_norm": 720.2564697265625, + "learning_rate": 3.736942070275404e-06, + "loss": 0.2916, + "step": 15530 + }, + { + "epoch": 66.00410256410257, + "grad_norm": 0.00027234895969741046, + "learning_rate": 3.7321937321937323e-06, + "loss": 0.0003, + "step": 15540 + }, + { + "epoch": 66.00452991452991, + "grad_norm": 0.0012739448575302958, + "learning_rate": 3.727445394112061e-06, + "loss": 0.0, + "step": 15550 + }, + { + "epoch": 66.00495726495727, + "grad_norm": 0.0005203865002840757, + "learning_rate": 3.7226970560303897e-06, + "loss": 0.0, + "step": 15560 + }, + { + "epoch": 66.00538461538461, + "grad_norm": 0.0003277095383964479, + "learning_rate": 3.7179487179487184e-06, + "loss": 0.0, + "step": 15570 + }, + { + "epoch": 66.00581196581197, + "grad_norm": 0.00044921765220351517, + "learning_rate": 3.7132003798670467e-06, + "loss": 0.0, + "step": 15580 + }, + { + "epoch": 66.00623931623932, + "grad_norm": 0.0003073754196520895, + "learning_rate": 3.7084520417853754e-06, + "loss": 0.0, + "step": 15590 + }, + { + "epoch": 66.00666666666666, + "grad_norm": 0.00030189528479240835, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0001, + "step": 15600 + }, + { + "epoch": 66.00709401709402, + "grad_norm": 0.0005707856616936624, + "learning_rate": 3.698955365622033e-06, + "loss": 0.0001, + "step": 15610 + }, + { + "epoch": 66.00752136752136, + "grad_norm": 0.00040740257827565074, + "learning_rate": 3.694207027540361e-06, + "loss": 0.0, + "step": 15620 + }, + { + "epoch": 66.00794871794872, + "grad_norm": 0.00039323364035226405, + "learning_rate": 3.68945868945869e-06, + "loss": 0.0, + "step": 15630 + }, + { + "epoch": 66.00837606837607, + "grad_norm": 0.0014208744978532195, + "learning_rate": 3.684710351377018e-06, + "loss": 0.0, + "step": 15640 + }, + { + "epoch": 66.00880341880342, + "grad_norm": 0.0007801980245858431, + "learning_rate": 3.679962013295347e-06, + "loss": 0.0001, + "step": 15650 + }, + { + "epoch": 66.00923076923077, + "grad_norm": 0.0017435887129977345, + "learning_rate": 3.6752136752136756e-06, + "loss": 0.0001, + "step": 15660 + }, + { + "epoch": 66.00965811965811, + "grad_norm": 0.03651350364089012, + "learning_rate": 3.6704653371320044e-06, + "loss": 0.0, + "step": 15670 + }, + { + "epoch": 66.01, + "eval_accuracy": 0.4, + "eval_loss": 6.4378485679626465, + "eval_runtime": 33.2393, + "eval_samples_per_second": 0.752, + "eval_steps_per_second": 0.752, + "step": 15678 + }, + { + "epoch": 67.00008547008547, + "grad_norm": 0.00035544627462513745, + "learning_rate": 3.6657169990503327e-06, + "loss": 0.0001, + "step": 15680 + }, + { + "epoch": 67.00051282051282, + "grad_norm": 0.00033005906152538955, + "learning_rate": 3.6609686609686614e-06, + "loss": 0.0, + "step": 15690 + }, + { + "epoch": 67.00094017094017, + "grad_norm": 0.0003103645867668092, + "learning_rate": 3.6562203228869897e-06, + "loss": 0.0, + "step": 15700 + }, + { + "epoch": 67.00136752136753, + "grad_norm": 0.0008121723076328635, + "learning_rate": 3.651471984805318e-06, + "loss": 0.0, + "step": 15710 + }, + { + "epoch": 67.00179487179487, + "grad_norm": 0.0004981961101293564, + "learning_rate": 3.646723646723647e-06, + "loss": 0.0, + "step": 15720 + }, + { + "epoch": 67.00222222222222, + "grad_norm": 0.0003290316089987755, + "learning_rate": 3.641975308641976e-06, + "loss": 0.0, + "step": 15730 + }, + { + "epoch": 67.00264957264957, + "grad_norm": 0.0010973671451210976, + "learning_rate": 3.637226970560304e-06, + "loss": 0.0, + "step": 15740 + }, + { + "epoch": 67.00307692307692, + "grad_norm": 0.0006377913523465395, + "learning_rate": 3.632478632478633e-06, + "loss": 0.0, + "step": 15750 + }, + { + "epoch": 67.00350427350428, + "grad_norm": 0.0005305053782649338, + "learning_rate": 3.627730294396961e-06, + "loss": 0.0, + "step": 15760 + }, + { + "epoch": 67.00393162393162, + "grad_norm": 0.0019237673841416836, + "learning_rate": 3.6229819563152903e-06, + "loss": 0.0006, + "step": 15770 + }, + { + "epoch": 67.00435897435898, + "grad_norm": 0.0016178624937310815, + "learning_rate": 3.6182336182336186e-06, + "loss": 0.0, + "step": 15780 + }, + { + "epoch": 67.00478632478632, + "grad_norm": 0.00046172275324352086, + "learning_rate": 3.6134852801519473e-06, + "loss": 0.0, + "step": 15790 + }, + { + "epoch": 67.00521367521368, + "grad_norm": 0.0002967897162307054, + "learning_rate": 3.6087369420702756e-06, + "loss": 0.0001, + "step": 15800 + }, + { + "epoch": 67.00564102564103, + "grad_norm": 0.0002238056476926431, + "learning_rate": 3.603988603988604e-06, + "loss": 0.0941, + "step": 15810 + }, + { + "epoch": 67.00606837606837, + "grad_norm": 0.00390687957406044, + "learning_rate": 3.599240265906933e-06, + "loss": 0.001, + "step": 15820 + }, + { + "epoch": 67.00649572649573, + "grad_norm": 0.0019423745106905699, + "learning_rate": 3.5944919278252618e-06, + "loss": 0.0001, + "step": 15830 + }, + { + "epoch": 67.00692307692307, + "grad_norm": 0.00027149979723617435, + "learning_rate": 3.58974358974359e-06, + "loss": 0.0, + "step": 15840 + }, + { + "epoch": 67.00735042735043, + "grad_norm": 0.0007080771028995514, + "learning_rate": 3.5849952516619184e-06, + "loss": 0.0, + "step": 15850 + }, + { + "epoch": 67.00777777777778, + "grad_norm": 0.0003103041963186115, + "learning_rate": 3.580246913580247e-06, + "loss": 0.0135, + "step": 15860 + }, + { + "epoch": 67.00820512820513, + "grad_norm": 0.001910716644488275, + "learning_rate": 3.5754985754985762e-06, + "loss": 0.0, + "step": 15870 + }, + { + "epoch": 67.00863247863248, + "grad_norm": 904.8418579101562, + "learning_rate": 3.5707502374169045e-06, + "loss": 0.3092, + "step": 15880 + }, + { + "epoch": 67.00905982905982, + "grad_norm": 0.0006746945437043905, + "learning_rate": 3.566001899335233e-06, + "loss": 0.0, + "step": 15890 + }, + { + "epoch": 67.00948717948718, + "grad_norm": 0.0018293018219992518, + "learning_rate": 3.5612535612535615e-06, + "loss": 0.0001, + "step": 15900 + }, + { + "epoch": 67.00991452991452, + "grad_norm": 0.007653082720935345, + "learning_rate": 3.55650522317189e-06, + "loss": 0.0, + "step": 15910 + }, + { + "epoch": 67.01, + "eval_accuracy": 0.4, + "eval_loss": 5.659733772277832, + "eval_runtime": 35.0596, + "eval_samples_per_second": 0.713, + "eval_steps_per_second": 0.713, + "step": 15912 + }, + { + "epoch": 68.00034188034188, + "grad_norm": 0.0002990284119732678, + "learning_rate": 3.551756885090219e-06, + "loss": 0.0272, + "step": 15920 + }, + { + "epoch": 68.00076923076924, + "grad_norm": 0.0034541944041848183, + "learning_rate": 3.5470085470085473e-06, + "loss": 0.0001, + "step": 15930 + }, + { + "epoch": 68.00119658119658, + "grad_norm": 0.00031465632491745055, + "learning_rate": 3.542260208926876e-06, + "loss": 0.0, + "step": 15940 + }, + { + "epoch": 68.00162393162393, + "grad_norm": 0.0003612114815041423, + "learning_rate": 3.5375118708452043e-06, + "loss": 0.0002, + "step": 15950 + }, + { + "epoch": 68.00205128205128, + "grad_norm": 0.0013253976358100772, + "learning_rate": 3.532763532763533e-06, + "loss": 0.0, + "step": 15960 + }, + { + "epoch": 68.00247863247863, + "grad_norm": 0.0014574190136045218, + "learning_rate": 3.5280151946818613e-06, + "loss": 0.0, + "step": 15970 + }, + { + "epoch": 68.00290598290599, + "grad_norm": 0.0002389967121416703, + "learning_rate": 3.5232668566001904e-06, + "loss": 0.0003, + "step": 15980 + }, + { + "epoch": 68.00333333333333, + "grad_norm": 0.0006442684680223465, + "learning_rate": 3.5185185185185187e-06, + "loss": 0.0, + "step": 15990 + }, + { + "epoch": 68.00376068376069, + "grad_norm": 0.0004441600467544049, + "learning_rate": 3.5137701804368475e-06, + "loss": 0.0, + "step": 16000 + }, + { + "epoch": 68.00418803418803, + "grad_norm": 0.0006568527896888554, + "learning_rate": 3.5090218423551758e-06, + "loss": 0.0, + "step": 16010 + }, + { + "epoch": 68.00461538461539, + "grad_norm": 0.0003113803395535797, + "learning_rate": 3.5042735042735045e-06, + "loss": 0.0, + "step": 16020 + }, + { + "epoch": 68.00504273504274, + "grad_norm": 0.0012534753186628222, + "learning_rate": 3.499525166191833e-06, + "loss": 0.0, + "step": 16030 + }, + { + "epoch": 68.00547008547008, + "grad_norm": 0.0026429889257997274, + "learning_rate": 3.494776828110162e-06, + "loss": 0.2954, + "step": 16040 + }, + { + "epoch": 68.00589743589744, + "grad_norm": 0.0012878195848315954, + "learning_rate": 3.4900284900284902e-06, + "loss": 0.0, + "step": 16050 + }, + { + "epoch": 68.00632478632478, + "grad_norm": 0.0004001953057013452, + "learning_rate": 3.485280151946819e-06, + "loss": 0.4867, + "step": 16060 + }, + { + "epoch": 68.00675213675214, + "grad_norm": 0.004468402359634638, + "learning_rate": 3.4805318138651472e-06, + "loss": 0.0006, + "step": 16070 + }, + { + "epoch": 68.00717948717949, + "grad_norm": 0.002943785861134529, + "learning_rate": 3.4757834757834764e-06, + "loss": 0.0002, + "step": 16080 + }, + { + "epoch": 68.00760683760684, + "grad_norm": 0.002336192177608609, + "learning_rate": 3.4710351377018047e-06, + "loss": 0.8271, + "step": 16090 + }, + { + "epoch": 68.00803418803419, + "grad_norm": 0.0003368109464645386, + "learning_rate": 3.4662867996201334e-06, + "loss": 0.3306, + "step": 16100 + }, + { + "epoch": 68.00846153846153, + "grad_norm": 0.005345764569938183, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.498, + "step": 16110 + }, + { + "epoch": 68.00888888888889, + "grad_norm": 0.005093185696750879, + "learning_rate": 3.4567901234567904e-06, + "loss": 0.0001, + "step": 16120 + }, + { + "epoch": 68.00931623931623, + "grad_norm": 0.00032298368751071393, + "learning_rate": 3.452041785375119e-06, + "loss": 0.0, + "step": 16130 + }, + { + "epoch": 68.0097435897436, + "grad_norm": 0.0008845495176501572, + "learning_rate": 3.447293447293448e-06, + "loss": 0.0, + "step": 16140 + }, + { + "epoch": 68.01, + "eval_accuracy": 0.44, + "eval_loss": 5.819715976715088, + "eval_runtime": 32.8708, + "eval_samples_per_second": 0.761, + "eval_steps_per_second": 0.761, + "step": 16146 + }, + { + "epoch": 69.00017094017095, + "grad_norm": 0.003377682762220502, + "learning_rate": 3.442545109211776e-06, + "loss": 0.0002, + "step": 16150 + }, + { + "epoch": 69.00059829059829, + "grad_norm": 0.0002674728457350284, + "learning_rate": 3.437796771130105e-06, + "loss": 0.0, + "step": 16160 + }, + { + "epoch": 69.00102564102563, + "grad_norm": 0.0016994690522551537, + "learning_rate": 3.433048433048433e-06, + "loss": 0.0, + "step": 16170 + }, + { + "epoch": 69.001452991453, + "grad_norm": 0.00035284223849885166, + "learning_rate": 3.4283000949667615e-06, + "loss": 0.0, + "step": 16180 + }, + { + "epoch": 69.00188034188034, + "grad_norm": 0.00023986499581951648, + "learning_rate": 3.4235517568850906e-06, + "loss": 0.0, + "step": 16190 + }, + { + "epoch": 69.0023076923077, + "grad_norm": 0.0023895238991826773, + "learning_rate": 3.4188034188034193e-06, + "loss": 0.0, + "step": 16200 + }, + { + "epoch": 69.00273504273504, + "grad_norm": 0.0018031138461083174, + "learning_rate": 3.4140550807217476e-06, + "loss": 0.0, + "step": 16210 + }, + { + "epoch": 69.0031623931624, + "grad_norm": 0.003573950147256255, + "learning_rate": 3.409306742640076e-06, + "loss": 0.0, + "step": 16220 + }, + { + "epoch": 69.00358974358974, + "grad_norm": 0.0015754502965137362, + "learning_rate": 3.4045584045584046e-06, + "loss": 0.0, + "step": 16230 + }, + { + "epoch": 69.0040170940171, + "grad_norm": 0.0012102372711524367, + "learning_rate": 3.3998100664767338e-06, + "loss": 0.0, + "step": 16240 + }, + { + "epoch": 69.00444444444445, + "grad_norm": 0.0002841241657733917, + "learning_rate": 3.395061728395062e-06, + "loss": 0.0, + "step": 16250 + }, + { + "epoch": 69.00487179487179, + "grad_norm": 0.01132782083004713, + "learning_rate": 3.3903133903133904e-06, + "loss": 0.0001, + "step": 16260 + }, + { + "epoch": 69.00529914529915, + "grad_norm": 0.00031476642470806837, + "learning_rate": 3.385565052231719e-06, + "loss": 0.0001, + "step": 16270 + }, + { + "epoch": 69.00572649572649, + "grad_norm": 0.000270717719104141, + "learning_rate": 3.3808167141500474e-06, + "loss": 0.0, + "step": 16280 + }, + { + "epoch": 69.00615384615385, + "grad_norm": 0.00036470277700573206, + "learning_rate": 3.3760683760683765e-06, + "loss": 0.0, + "step": 16290 + }, + { + "epoch": 69.0065811965812, + "grad_norm": 0.001150319236330688, + "learning_rate": 3.371320037986705e-06, + "loss": 0.0, + "step": 16300 + }, + { + "epoch": 69.00700854700855, + "grad_norm": 0.0003526509099174291, + "learning_rate": 3.3665716999050336e-06, + "loss": 0.0, + "step": 16310 + }, + { + "epoch": 69.0074358974359, + "grad_norm": 0.0005036306101828814, + "learning_rate": 3.361823361823362e-06, + "loss": 0.0, + "step": 16320 + }, + { + "epoch": 69.00786324786324, + "grad_norm": 0.0007515393663197756, + "learning_rate": 3.3570750237416906e-06, + "loss": 0.0, + "step": 16330 + }, + { + "epoch": 69.0082905982906, + "grad_norm": 0.0001931802835315466, + "learning_rate": 3.3523266856600197e-06, + "loss": 0.0, + "step": 16340 + }, + { + "epoch": 69.00871794871794, + "grad_norm": 0.00042826347635127604, + "learning_rate": 3.347578347578348e-06, + "loss": 0.0001, + "step": 16350 + }, + { + "epoch": 69.0091452991453, + "grad_norm": 0.00022397881548386067, + "learning_rate": 3.3428300094966763e-06, + "loss": 0.0, + "step": 16360 + }, + { + "epoch": 69.00957264957265, + "grad_norm": 0.00021716070477850735, + "learning_rate": 3.338081671415005e-06, + "loss": 0.0, + "step": 16370 + }, + { + "epoch": 69.01, + "grad_norm": 0.00088693160796538, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0061, + "step": 16380 + }, + { + "epoch": 69.01, + "eval_accuracy": 0.4, + "eval_loss": 6.014081001281738, + "eval_runtime": 33.8632, + "eval_samples_per_second": 0.738, + "eval_steps_per_second": 0.738, + "step": 16380 + }, + { + "epoch": 70.00042735042734, + "grad_norm": 0.0004485013196244836, + "learning_rate": 3.3285849952516625e-06, + "loss": 0.0, + "step": 16390 + }, + { + "epoch": 70.0008547008547, + "grad_norm": 0.0004966802662238479, + "learning_rate": 3.3238366571699908e-06, + "loss": 0.0, + "step": 16400 + }, + { + "epoch": 70.00128205128205, + "grad_norm": 0.0006574277649633586, + "learning_rate": 3.3190883190883195e-06, + "loss": 0.001, + "step": 16410 + }, + { + "epoch": 70.0017094017094, + "grad_norm": 0.21187789738178253, + "learning_rate": 3.3143399810066478e-06, + "loss": 0.0001, + "step": 16420 + }, + { + "epoch": 70.00213675213675, + "grad_norm": 0.00020025823323521763, + "learning_rate": 3.3095916429249765e-06, + "loss": 0.0, + "step": 16430 + }, + { + "epoch": 70.00256410256411, + "grad_norm": 0.0003043335455004126, + "learning_rate": 3.304843304843305e-06, + "loss": 0.0, + "step": 16440 + }, + { + "epoch": 70.00299145299145, + "grad_norm": 0.0003146652306895703, + "learning_rate": 3.300094966761634e-06, + "loss": 0.0, + "step": 16450 + }, + { + "epoch": 70.00341880341881, + "grad_norm": 0.00840036105364561, + "learning_rate": 3.2953466286799622e-06, + "loss": 1.0593, + "step": 16460 + }, + { + "epoch": 70.00384615384615, + "grad_norm": 0.0003192836884409189, + "learning_rate": 3.290598290598291e-06, + "loss": 0.0, + "step": 16470 + }, + { + "epoch": 70.0042735042735, + "grad_norm": 0.0012308260193094611, + "learning_rate": 3.2858499525166193e-06, + "loss": 0.0, + "step": 16480 + }, + { + "epoch": 70.00470085470086, + "grad_norm": 0.0008993004448711872, + "learning_rate": 3.281101614434948e-06, + "loss": 0.0, + "step": 16490 + }, + { + "epoch": 70.0051282051282, + "grad_norm": 0.0010926368413493037, + "learning_rate": 3.2763532763532767e-06, + "loss": 0.0, + "step": 16500 + }, + { + "epoch": 70.00555555555556, + "grad_norm": 0.0005557582480832934, + "learning_rate": 3.2716049382716054e-06, + "loss": 0.0001, + "step": 16510 + }, + { + "epoch": 70.0059829059829, + "grad_norm": 0.0008928438182920218, + "learning_rate": 3.2668566001899337e-06, + "loss": 0.0, + "step": 16520 + }, + { + "epoch": 70.00641025641026, + "grad_norm": 0.0003105592622887343, + "learning_rate": 3.2621082621082624e-06, + "loss": 0.0001, + "step": 16530 + }, + { + "epoch": 70.0068376068376, + "grad_norm": 0.0003956545260734856, + "learning_rate": 3.2573599240265907e-06, + "loss": 0.0, + "step": 16540 + }, + { + "epoch": 70.00726495726495, + "grad_norm": 0.0002564324822742492, + "learning_rate": 3.25261158594492e-06, + "loss": 0.0066, + "step": 16550 + }, + { + "epoch": 70.00769230769231, + "grad_norm": 0.00025351883959956467, + "learning_rate": 3.247863247863248e-06, + "loss": 0.0, + "step": 16560 + }, + { + "epoch": 70.00811965811965, + "grad_norm": 0.00046608541742898524, + "learning_rate": 3.243114909781577e-06, + "loss": 0.0023, + "step": 16570 + }, + { + "epoch": 70.00854700854701, + "grad_norm": 0.002306720009073615, + "learning_rate": 3.238366571699905e-06, + "loss": 0.0, + "step": 16580 + }, + { + "epoch": 70.00897435897436, + "grad_norm": 0.0005604177713394165, + "learning_rate": 3.2336182336182335e-06, + "loss": 0.0, + "step": 16590 + }, + { + "epoch": 70.00940170940171, + "grad_norm": 0.0008777379989624023, + "learning_rate": 3.2288698955365626e-06, + "loss": 0.0, + "step": 16600 + }, + { + "epoch": 70.00982905982906, + "grad_norm": 0.0015941828023642302, + "learning_rate": 3.2241215574548913e-06, + "loss": 0.0001, + "step": 16610 + }, + { + "epoch": 70.01, + "eval_accuracy": 0.4, + "eval_loss": 6.244938373565674, + "eval_runtime": 34.6306, + "eval_samples_per_second": 0.722, + "eval_steps_per_second": 0.722, + "step": 16614 + }, + { + "epoch": 71.00025641025641, + "grad_norm": 0.0021248271223157644, + "learning_rate": 3.2193732193732196e-06, + "loss": 0.7973, + "step": 16620 + }, + { + "epoch": 71.00068376068376, + "grad_norm": 0.00018868227198254317, + "learning_rate": 3.214624881291548e-06, + "loss": 0.0, + "step": 16630 + }, + { + "epoch": 71.00111111111111, + "grad_norm": 921.1409301757812, + "learning_rate": 3.2098765432098767e-06, + "loss": 0.3031, + "step": 16640 + }, + { + "epoch": 71.00153846153846, + "grad_norm": 0.0012714448384940624, + "learning_rate": 3.205128205128206e-06, + "loss": 0.0, + "step": 16650 + }, + { + "epoch": 71.00196581196582, + "grad_norm": 0.0014924101997166872, + "learning_rate": 3.200379867046534e-06, + "loss": 0.0079, + "step": 16660 + }, + { + "epoch": 71.00239316239316, + "grad_norm": 0.00040770156192593277, + "learning_rate": 3.195631528964863e-06, + "loss": 0.0018, + "step": 16670 + }, + { + "epoch": 71.0028205128205, + "grad_norm": 0.00022908284154254943, + "learning_rate": 3.190883190883191e-06, + "loss": 0.002, + "step": 16680 + }, + { + "epoch": 71.00324786324786, + "grad_norm": 0.0016254654619842768, + "learning_rate": 3.1861348528015194e-06, + "loss": 0.7552, + "step": 16690 + }, + { + "epoch": 71.00367521367521, + "grad_norm": 0.0006611282587982714, + "learning_rate": 3.181386514719848e-06, + "loss": 0.0972, + "step": 16700 + }, + { + "epoch": 71.00410256410257, + "grad_norm": 2.668363332748413, + "learning_rate": 3.1766381766381773e-06, + "loss": 0.0006, + "step": 16710 + }, + { + "epoch": 71.00452991452991, + "grad_norm": 0.0007531665614806116, + "learning_rate": 3.1718898385565056e-06, + "loss": 0.0, + "step": 16720 + }, + { + "epoch": 71.00495726495727, + "grad_norm": 0.0007412447594106197, + "learning_rate": 3.167141500474834e-06, + "loss": 0.0, + "step": 16730 + }, + { + "epoch": 71.00538461538461, + "grad_norm": 0.0008477046503685415, + "learning_rate": 3.1623931623931626e-06, + "loss": 0.9046, + "step": 16740 + }, + { + "epoch": 71.00581196581197, + "grad_norm": 146.11962890625, + "learning_rate": 3.157644824311491e-06, + "loss": 0.3816, + "step": 16750 + }, + { + "epoch": 71.00623931623932, + "grad_norm": 0.0002443613775540143, + "learning_rate": 3.15289648622982e-06, + "loss": 0.2428, + "step": 16760 + }, + { + "epoch": 71.00666666666666, + "grad_norm": 0.00033410079777240753, + "learning_rate": 3.1481481481481483e-06, + "loss": 0.0, + "step": 16770 + }, + { + "epoch": 71.00709401709402, + "grad_norm": 0.002586389658972621, + "learning_rate": 3.143399810066477e-06, + "loss": 0.8823, + "step": 16780 + }, + { + "epoch": 71.00752136752136, + "grad_norm": 0.001075676642358303, + "learning_rate": 3.1386514719848053e-06, + "loss": 0.0, + "step": 16790 + }, + { + "epoch": 71.00794871794872, + "grad_norm": 0.002573596313595772, + "learning_rate": 3.133903133903134e-06, + "loss": 0.0, + "step": 16800 + }, + { + "epoch": 71.00837606837607, + "grad_norm": 0.00022963988885749131, + "learning_rate": 3.1291547958214628e-06, + "loss": 0.0, + "step": 16810 + }, + { + "epoch": 71.00880341880342, + "grad_norm": 0.0005840660887770355, + "learning_rate": 3.1244064577397915e-06, + "loss": 0.0, + "step": 16820 + }, + { + "epoch": 71.00923076923077, + "grad_norm": 0.00029322231421247125, + "learning_rate": 3.11965811965812e-06, + "loss": 0.0, + "step": 16830 + }, + { + "epoch": 71.00965811965811, + "grad_norm": 0.00025132184964604676, + "learning_rate": 3.1149097815764485e-06, + "loss": 0.0001, + "step": 16840 + }, + { + "epoch": 71.01, + "eval_accuracy": 0.4, + "eval_loss": 6.252962589263916, + "eval_runtime": 33.4983, + "eval_samples_per_second": 0.746, + "eval_steps_per_second": 0.746, + "step": 16848 + }, + { + "epoch": 72.00008547008547, + "grad_norm": 0.0019571471493691206, + "learning_rate": 3.110161443494777e-06, + "loss": 0.0, + "step": 16850 + }, + { + "epoch": 72.00051282051282, + "grad_norm": 0.0005339854396879673, + "learning_rate": 3.105413105413106e-06, + "loss": 0.0001, + "step": 16860 + }, + { + "epoch": 72.00094017094017, + "grad_norm": 0.00019883255299646407, + "learning_rate": 3.1006647673314343e-06, + "loss": 0.0, + "step": 16870 + }, + { + "epoch": 72.00136752136753, + "grad_norm": 0.00024648249382153153, + "learning_rate": 3.095916429249763e-06, + "loss": 0.0003, + "step": 16880 + }, + { + "epoch": 72.00179487179487, + "grad_norm": 0.0015446147881448269, + "learning_rate": 3.0911680911680913e-06, + "loss": 0.0006, + "step": 16890 + }, + { + "epoch": 72.00222222222222, + "grad_norm": 0.0021758642978966236, + "learning_rate": 3.08641975308642e-06, + "loss": 0.0001, + "step": 16900 + }, + { + "epoch": 72.00264957264957, + "grad_norm": 0.0003914159897249192, + "learning_rate": 3.0816714150047487e-06, + "loss": 0.0, + "step": 16910 + }, + { + "epoch": 72.00307692307692, + "grad_norm": 0.0009236117475666106, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.0, + "step": 16920 + }, + { + "epoch": 72.00350427350428, + "grad_norm": 0.0002460191317368299, + "learning_rate": 3.0721747388414057e-06, + "loss": 0.0, + "step": 16930 + }, + { + "epoch": 72.00393162393162, + "grad_norm": 0.00033975299447774887, + "learning_rate": 3.0674264007597345e-06, + "loss": 0.0, + "step": 16940 + }, + { + "epoch": 72.00435897435898, + "grad_norm": 0.001101949717849493, + "learning_rate": 3.0626780626780627e-06, + "loss": 0.0, + "step": 16950 + }, + { + "epoch": 72.00478632478632, + "grad_norm": 0.0031472702976316214, + "learning_rate": 3.0579297245963915e-06, + "loss": 0.5291, + "step": 16960 + }, + { + "epoch": 72.00521367521368, + "grad_norm": 0.00030517615959979594, + "learning_rate": 3.05318138651472e-06, + "loss": 0.0, + "step": 16970 + }, + { + "epoch": 72.00564102564103, + "grad_norm": 0.0015221843495965004, + "learning_rate": 3.048433048433049e-06, + "loss": 0.0004, + "step": 16980 + }, + { + "epoch": 72.00606837606837, + "grad_norm": 0.0034548670519143343, + "learning_rate": 3.043684710351377e-06, + "loss": 0.0, + "step": 16990 + }, + { + "epoch": 72.00649572649573, + "grad_norm": 0.0006523687625303864, + "learning_rate": 3.038936372269706e-06, + "loss": 0.9208, + "step": 17000 + }, + { + "epoch": 72.00692307692307, + "grad_norm": 0.0015028053894639015, + "learning_rate": 3.0341880341880342e-06, + "loss": 0.0, + "step": 17010 + }, + { + "epoch": 72.00735042735043, + "grad_norm": 0.00025994013412855566, + "learning_rate": 3.0294396961063634e-06, + "loss": 0.0, + "step": 17020 + }, + { + "epoch": 72.00777777777778, + "grad_norm": 0.000636533775832504, + "learning_rate": 3.0246913580246917e-06, + "loss": 0.0303, + "step": 17030 + }, + { + "epoch": 72.00820512820513, + "grad_norm": 0.003830546513199806, + "learning_rate": 3.0199430199430204e-06, + "loss": 0.0001, + "step": 17040 + }, + { + "epoch": 72.00863247863248, + "grad_norm": 0.00017824105452746153, + "learning_rate": 3.0151946818613487e-06, + "loss": 0.0, + "step": 17050 + }, + { + "epoch": 72.00905982905982, + "grad_norm": 0.00022492320567835122, + "learning_rate": 3.010446343779677e-06, + "loss": 0.0, + "step": 17060 + }, + { + "epoch": 72.00948717948718, + "grad_norm": 0.0020875674672424793, + "learning_rate": 3.005698005698006e-06, + "loss": 0.0, + "step": 17070 + }, + { + "epoch": 72.00991452991452, + "grad_norm": 0.0006881671724840999, + "learning_rate": 3.000949667616335e-06, + "loss": 0.0, + "step": 17080 + }, + { + "epoch": 72.01, + "eval_accuracy": 0.4, + "eval_loss": 5.765503406524658, + "eval_runtime": 33.3353, + "eval_samples_per_second": 0.75, + "eval_steps_per_second": 0.75, + "step": 17082 + }, + { + "epoch": 73.00034188034188, + "grad_norm": 0.0012791932094842196, + "learning_rate": 2.996201329534663e-06, + "loss": 0.0, + "step": 17090 + }, + { + "epoch": 73.00076923076924, + "grad_norm": 0.00039805949199944735, + "learning_rate": 2.9914529914529914e-06, + "loss": 0.0, + "step": 17100 + }, + { + "epoch": 73.00119658119658, + "grad_norm": 0.0010755633702501655, + "learning_rate": 2.98670465337132e-06, + "loss": 0.0001, + "step": 17110 + }, + { + "epoch": 73.00162393162393, + "grad_norm": 0.07058065384626389, + "learning_rate": 2.9819563152896493e-06, + "loss": 0.0, + "step": 17120 + }, + { + "epoch": 73.00205128205128, + "grad_norm": 0.00041162193519994617, + "learning_rate": 2.9772079772079776e-06, + "loss": 0.0, + "step": 17130 + }, + { + "epoch": 73.00247863247863, + "grad_norm": 0.0027573970146477222, + "learning_rate": 2.972459639126306e-06, + "loss": 0.0, + "step": 17140 + }, + { + "epoch": 73.00290598290599, + "grad_norm": 0.0003702895774040371, + "learning_rate": 2.9677113010446346e-06, + "loss": 0.0, + "step": 17150 + }, + { + "epoch": 73.00333333333333, + "grad_norm": 0.0007125766715034842, + "learning_rate": 2.962962962962963e-06, + "loss": 0.0, + "step": 17160 + }, + { + "epoch": 73.00376068376069, + "grad_norm": 0.00024414356448687613, + "learning_rate": 2.9582146248812916e-06, + "loss": 0.0, + "step": 17170 + }, + { + "epoch": 73.00418803418803, + "grad_norm": 0.0009582286584191024, + "learning_rate": 2.9534662867996203e-06, + "loss": 0.0001, + "step": 17180 + }, + { + "epoch": 73.00461538461539, + "grad_norm": 0.00019269179028924555, + "learning_rate": 2.948717948717949e-06, + "loss": 0.0, + "step": 17190 + }, + { + "epoch": 73.00504273504274, + "grad_norm": 0.0013847124064341187, + "learning_rate": 2.9439696106362774e-06, + "loss": 0.0, + "step": 17200 + }, + { + "epoch": 73.00547008547008, + "grad_norm": 0.8285155892372131, + "learning_rate": 2.939221272554606e-06, + "loss": 0.0002, + "step": 17210 + }, + { + "epoch": 73.00589743589744, + "grad_norm": 0.0001845570222940296, + "learning_rate": 2.9344729344729344e-06, + "loss": 0.0, + "step": 17220 + }, + { + "epoch": 73.00632478632478, + "grad_norm": 561.99072265625, + "learning_rate": 2.9297245963912635e-06, + "loss": 0.6593, + "step": 17230 + }, + { + "epoch": 73.00675213675214, + "grad_norm": 0.17753881216049194, + "learning_rate": 2.924976258309592e-06, + "loss": 0.0001, + "step": 17240 + }, + { + "epoch": 73.00717948717949, + "grad_norm": 0.00024875238887034357, + "learning_rate": 2.9202279202279205e-06, + "loss": 0.0, + "step": 17250 + }, + { + "epoch": 73.00760683760684, + "grad_norm": 0.016402428969740868, + "learning_rate": 2.915479582146249e-06, + "loss": 0.0039, + "step": 17260 + }, + { + "epoch": 73.00803418803419, + "grad_norm": 0.001006082515232265, + "learning_rate": 2.9107312440645776e-06, + "loss": 0.0038, + "step": 17270 + }, + { + "epoch": 73.00846153846153, + "grad_norm": 0.0013266198802739382, + "learning_rate": 2.9059829059829063e-06, + "loss": 0.0113, + "step": 17280 + }, + { + "epoch": 73.00888888888889, + "grad_norm": 0.00038516995846293867, + "learning_rate": 2.901234567901235e-06, + "loss": 0.0, + "step": 17290 + }, + { + "epoch": 73.00931623931623, + "grad_norm": 0.0010296168038621545, + "learning_rate": 2.8964862298195633e-06, + "loss": 0.0, + "step": 17300 + }, + { + "epoch": 73.0097435897436, + "grad_norm": 0.0006145525840111077, + "learning_rate": 2.891737891737892e-06, + "loss": 0.0, + "step": 17310 + }, + { + "epoch": 73.01, + "eval_accuracy": 0.4, + "eval_loss": 6.152116775512695, + "eval_runtime": 33.4645, + "eval_samples_per_second": 0.747, + "eval_steps_per_second": 0.747, + "step": 17316 + }, + { + "epoch": 74.00017094017095, + "grad_norm": 0.00017263834888581187, + "learning_rate": 2.8869895536562203e-06, + "loss": 0.0, + "step": 17320 + }, + { + "epoch": 74.00059829059829, + "grad_norm": 0.00031588933779858053, + "learning_rate": 2.8822412155745495e-06, + "loss": 0.0034, + "step": 17330 + }, + { + "epoch": 74.00102564102563, + "grad_norm": 0.0006647444679401815, + "learning_rate": 2.8774928774928778e-06, + "loss": 0.0, + "step": 17340 + }, + { + "epoch": 74.001452991453, + "grad_norm": 0.0013857238227501512, + "learning_rate": 2.8727445394112065e-06, + "loss": 0.0004, + "step": 17350 + }, + { + "epoch": 74.00188034188034, + "grad_norm": 0.003892462467774749, + "learning_rate": 2.8679962013295348e-06, + "loss": 0.0002, + "step": 17360 + }, + { + "epoch": 74.0023076923077, + "grad_norm": 0.0004585685092024505, + "learning_rate": 2.8632478632478635e-06, + "loss": 0.0, + "step": 17370 + }, + { + "epoch": 74.00273504273504, + "grad_norm": 0.00025991059374064207, + "learning_rate": 2.858499525166192e-06, + "loss": 0.0008, + "step": 17380 + }, + { + "epoch": 74.0031623931624, + "grad_norm": 0.0004791081009898335, + "learning_rate": 2.853751187084521e-06, + "loss": 0.0354, + "step": 17390 + }, + { + "epoch": 74.00358974358974, + "grad_norm": 0.0005468902527354658, + "learning_rate": 2.8490028490028492e-06, + "loss": 0.0001, + "step": 17400 + }, + { + "epoch": 74.0040170940171, + "grad_norm": 0.01083172857761383, + "learning_rate": 2.844254510921178e-06, + "loss": 0.0, + "step": 17410 + }, + { + "epoch": 74.00444444444445, + "grad_norm": 0.0011809701099991798, + "learning_rate": 2.8395061728395062e-06, + "loss": 0.0, + "step": 17420 + }, + { + "epoch": 74.00487179487179, + "grad_norm": 0.0013473471626639366, + "learning_rate": 2.8347578347578345e-06, + "loss": 0.0, + "step": 17430 + }, + { + "epoch": 74.00529914529915, + "grad_norm": 0.00039711128920316696, + "learning_rate": 2.8300094966761637e-06, + "loss": 0.0, + "step": 17440 + }, + { + "epoch": 74.00572649572649, + "grad_norm": 0.00017134596419055015, + "learning_rate": 2.8252611585944924e-06, + "loss": 0.0, + "step": 17450 + }, + { + "epoch": 74.00615384615385, + "grad_norm": 0.00042395462514832616, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.0, + "step": 17460 + }, + { + "epoch": 74.0065811965812, + "grad_norm": 0.00016071122081484646, + "learning_rate": 2.815764482431149e-06, + "loss": 0.0, + "step": 17470 + }, + { + "epoch": 74.00700854700855, + "grad_norm": 0.20301441848278046, + "learning_rate": 2.8110161443494777e-06, + "loss": 0.0001, + "step": 17480 + }, + { + "epoch": 74.0074358974359, + "grad_norm": 0.0002695897710509598, + "learning_rate": 2.806267806267807e-06, + "loss": 0.0, + "step": 17490 + }, + { + "epoch": 74.00786324786324, + "grad_norm": 0.005475103389471769, + "learning_rate": 2.801519468186135e-06, + "loss": 0.0, + "step": 17500 + }, + { + "epoch": 74.0082905982906, + "grad_norm": 0.01020097080618143, + "learning_rate": 2.7967711301044635e-06, + "loss": 0.0, + "step": 17510 + }, + { + "epoch": 74.00871794871794, + "grad_norm": 0.00026164480368606746, + "learning_rate": 2.792022792022792e-06, + "loss": 0.0, + "step": 17520 + }, + { + "epoch": 74.0091452991453, + "grad_norm": 0.0005212483229115605, + "learning_rate": 2.7872744539411205e-06, + "loss": 0.0001, + "step": 17530 + }, + { + "epoch": 74.00957264957265, + "grad_norm": 0.00018099919543601573, + "learning_rate": 2.7825261158594496e-06, + "loss": 0.0, + "step": 17540 + }, + { + "epoch": 74.01, + "grad_norm": 0.00015604341751895845, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.0, + "step": 17550 + }, + { + "epoch": 74.01, + "eval_accuracy": 0.44, + "eval_loss": 6.159724235534668, + "eval_runtime": 33.1573, + "eval_samples_per_second": 0.754, + "eval_steps_per_second": 0.754, + "step": 17550 + }, + { + "epoch": 75.00042735042734, + "grad_norm": 0.00023313055862672627, + "learning_rate": 2.7730294396961066e-06, + "loss": 0.0, + "step": 17560 + }, + { + "epoch": 75.0008547008547, + "grad_norm": 0.002859926549717784, + "learning_rate": 2.768281101614435e-06, + "loss": 0.0, + "step": 17570 + }, + { + "epoch": 75.00128205128205, + "grad_norm": 0.002907276153564453, + "learning_rate": 2.7635327635327636e-06, + "loss": 0.0001, + "step": 17580 + }, + { + "epoch": 75.0017094017094, + "grad_norm": 0.0013060672208666801, + "learning_rate": 2.758784425451093e-06, + "loss": 0.0, + "step": 17590 + }, + { + "epoch": 75.00213675213675, + "grad_norm": 0.0003227469860576093, + "learning_rate": 2.754036087369421e-06, + "loss": 0.0, + "step": 17600 + }, + { + "epoch": 75.00256410256411, + "grad_norm": 0.0003190785355400294, + "learning_rate": 2.7492877492877494e-06, + "loss": 0.0, + "step": 17610 + }, + { + "epoch": 75.00299145299145, + "grad_norm": 0.00020427798153832555, + "learning_rate": 2.744539411206078e-06, + "loss": 0.0, + "step": 17620 + }, + { + "epoch": 75.00341880341881, + "grad_norm": 0.053999051451683044, + "learning_rate": 2.7397910731244064e-06, + "loss": 0.0001, + "step": 17630 + }, + { + "epoch": 75.00384615384615, + "grad_norm": 0.0002565534086897969, + "learning_rate": 2.7350427350427355e-06, + "loss": 0.0, + "step": 17640 + }, + { + "epoch": 75.0042735042735, + "grad_norm": 0.00017350871348753572, + "learning_rate": 2.730294396961064e-06, + "loss": 0.0, + "step": 17650 + }, + { + "epoch": 75.00470085470086, + "grad_norm": 0.00023242604220286012, + "learning_rate": 2.7255460588793926e-06, + "loss": 0.0, + "step": 17660 + }, + { + "epoch": 75.0051282051282, + "grad_norm": 0.00020732257689815015, + "learning_rate": 2.720797720797721e-06, + "loss": 0.0, + "step": 17670 + }, + { + "epoch": 75.00555555555556, + "grad_norm": 0.00024015163944568485, + "learning_rate": 2.7160493827160496e-06, + "loss": 0.0, + "step": 17680 + }, + { + "epoch": 75.0059829059829, + "grad_norm": 0.00017238871078006923, + "learning_rate": 2.711301044634378e-06, + "loss": 0.0, + "step": 17690 + }, + { + "epoch": 75.00641025641026, + "grad_norm": 0.0001814868883229792, + "learning_rate": 2.706552706552707e-06, + "loss": 0.0002, + "step": 17700 + }, + { + "epoch": 75.0068376068376, + "grad_norm": 0.00020034310000482947, + "learning_rate": 2.7018043684710353e-06, + "loss": 0.0, + "step": 17710 + }, + { + "epoch": 75.00726495726495, + "grad_norm": 0.004398710560053587, + "learning_rate": 2.697056030389364e-06, + "loss": 0.0, + "step": 17720 + }, + { + "epoch": 75.00769230769231, + "grad_norm": 0.00018758252554107457, + "learning_rate": 2.6923076923076923e-06, + "loss": 0.0, + "step": 17730 + }, + { + "epoch": 75.00811965811965, + "grad_norm": 0.0014624909963458776, + "learning_rate": 2.687559354226021e-06, + "loss": 0.0, + "step": 17740 + }, + { + "epoch": 75.00854700854701, + "grad_norm": 0.0002650118840392679, + "learning_rate": 2.6828110161443498e-06, + "loss": 0.0, + "step": 17750 + }, + { + "epoch": 75.00897435897436, + "grad_norm": 0.004331326577812433, + "learning_rate": 2.6780626780626785e-06, + "loss": 0.0, + "step": 17760 + }, + { + "epoch": 75.00940170940171, + "grad_norm": 0.0013256591046229005, + "learning_rate": 2.673314339981007e-06, + "loss": 0.0, + "step": 17770 + }, + { + "epoch": 75.00982905982906, + "grad_norm": 0.0001800288155209273, + "learning_rate": 2.6685660018993355e-06, + "loss": 0.6123, + "step": 17780 + }, + { + "epoch": 75.01, + "eval_accuracy": 0.4, + "eval_loss": 6.478565216064453, + "eval_runtime": 33.5224, + "eval_samples_per_second": 0.746, + "eval_steps_per_second": 0.746, + "step": 17784 + }, + { + "epoch": 76.00025641025641, + "grad_norm": 0.00023036974016577005, + "learning_rate": 2.663817663817664e-06, + "loss": 0.0, + "step": 17790 + }, + { + "epoch": 76.00068376068376, + "grad_norm": 0.0002549351193010807, + "learning_rate": 2.659069325735993e-06, + "loss": 0.0, + "step": 17800 + }, + { + "epoch": 76.00111111111111, + "grad_norm": 0.00015945105405990034, + "learning_rate": 2.6543209876543212e-06, + "loss": 0.0768, + "step": 17810 + }, + { + "epoch": 76.00153846153846, + "grad_norm": 0.00026457657804712653, + "learning_rate": 2.64957264957265e-06, + "loss": 0.0007, + "step": 17820 + }, + { + "epoch": 76.00196581196582, + "grad_norm": 0.0002729761472437531, + "learning_rate": 2.6448243114909783e-06, + "loss": 0.0, + "step": 17830 + }, + { + "epoch": 76.00239316239316, + "grad_norm": 0.00020036596106365323, + "learning_rate": 2.6400759734093066e-06, + "loss": 0.0, + "step": 17840 + }, + { + "epoch": 76.0028205128205, + "grad_norm": 0.00016762949235271662, + "learning_rate": 2.6353276353276357e-06, + "loss": 0.0, + "step": 17850 + }, + { + "epoch": 76.00324786324786, + "grad_norm": 0.0004586647264659405, + "learning_rate": 2.6305792972459644e-06, + "loss": 0.0, + "step": 17860 + }, + { + "epoch": 76.00367521367521, + "grad_norm": 0.006451745051890612, + "learning_rate": 2.6258309591642927e-06, + "loss": 0.0, + "step": 17870 + }, + { + "epoch": 76.00410256410257, + "grad_norm": 0.0031748872715979815, + "learning_rate": 2.6210826210826214e-06, + "loss": 0.0, + "step": 17880 + }, + { + "epoch": 76.00452991452991, + "grad_norm": 0.0012073371326550841, + "learning_rate": 2.6163342830009497e-06, + "loss": 0.0, + "step": 17890 + }, + { + "epoch": 76.00495726495727, + "grad_norm": 0.0001654605002840981, + "learning_rate": 2.611585944919278e-06, + "loss": 0.0, + "step": 17900 + }, + { + "epoch": 76.00538461538461, + "grad_norm": 0.0012388996547088027, + "learning_rate": 2.606837606837607e-06, + "loss": 0.0, + "step": 17910 + }, + { + "epoch": 76.00581196581197, + "grad_norm": 0.00017125860904343426, + "learning_rate": 2.602089268755936e-06, + "loss": 0.0, + "step": 17920 + }, + { + "epoch": 76.00623931623932, + "grad_norm": 0.004054773598909378, + "learning_rate": 2.597340930674264e-06, + "loss": 0.0001, + "step": 17930 + }, + { + "epoch": 76.00666666666666, + "grad_norm": 0.0019136080518364906, + "learning_rate": 2.5925925925925925e-06, + "loss": 0.0, + "step": 17940 + }, + { + "epoch": 76.00709401709402, + "grad_norm": 0.0007586510619148612, + "learning_rate": 2.587844254510921e-06, + "loss": 0.0, + "step": 17950 + }, + { + "epoch": 76.00752136752136, + "grad_norm": 0.0010209938045591116, + "learning_rate": 2.5830959164292504e-06, + "loss": 0.0004, + "step": 17960 + }, + { + "epoch": 76.00794871794872, + "grad_norm": 0.00018688829732127488, + "learning_rate": 2.5783475783475787e-06, + "loss": 0.0, + "step": 17970 + }, + { + "epoch": 76.00837606837607, + "grad_norm": 0.00018369669851381332, + "learning_rate": 2.573599240265907e-06, + "loss": 0.0, + "step": 17980 + }, + { + "epoch": 76.00880341880342, + "grad_norm": 0.0010952269658446312, + "learning_rate": 2.5688509021842357e-06, + "loss": 0.0, + "step": 17990 + }, + { + "epoch": 76.00923076923077, + "grad_norm": 0.00021538576402235776, + "learning_rate": 2.564102564102564e-06, + "loss": 0.0003, + "step": 18000 + }, + { + "epoch": 76.00965811965811, + "grad_norm": 0.0005051797488704324, + "learning_rate": 2.559354226020893e-06, + "loss": 0.0, + "step": 18010 + }, + { + "epoch": 76.01, + "eval_accuracy": 0.4, + "eval_loss": 6.552752494812012, + "eval_runtime": 33.6617, + "eval_samples_per_second": 0.743, + "eval_steps_per_second": 0.743, + "step": 18018 + }, + { + "epoch": 77.00008547008547, + "grad_norm": 0.000823773501906544, + "learning_rate": 2.5546058879392214e-06, + "loss": 0.0004, + "step": 18020 + }, + { + "epoch": 77.00051282051282, + "grad_norm": 0.001240988029167056, + "learning_rate": 2.54985754985755e-06, + "loss": 0.0, + "step": 18030 + }, + { + "epoch": 77.00094017094017, + "grad_norm": 0.0006408431218005717, + "learning_rate": 2.5451092117758784e-06, + "loss": 0.0001, + "step": 18040 + }, + { + "epoch": 77.00136752136753, + "grad_norm": 0.00021658137848135084, + "learning_rate": 2.540360873694207e-06, + "loss": 0.0, + "step": 18050 + }, + { + "epoch": 77.00179487179487, + "grad_norm": 0.00019429672101978213, + "learning_rate": 2.535612535612536e-06, + "loss": 0.0, + "step": 18060 + }, + { + "epoch": 77.00222222222222, + "grad_norm": 0.00022740900749340653, + "learning_rate": 2.5308641975308646e-06, + "loss": 0.0, + "step": 18070 + }, + { + "epoch": 77.00264957264957, + "grad_norm": 0.00015916908159852028, + "learning_rate": 2.526115859449193e-06, + "loss": 0.7221, + "step": 18080 + }, + { + "epoch": 77.00307692307692, + "grad_norm": 0.00014186595217324793, + "learning_rate": 2.5213675213675216e-06, + "loss": 0.0, + "step": 18090 + }, + { + "epoch": 77.00350427350428, + "grad_norm": 0.00016282236902043223, + "learning_rate": 2.51661918328585e-06, + "loss": 0.0, + "step": 18100 + }, + { + "epoch": 77.00393162393162, + "grad_norm": 0.0003063238982576877, + "learning_rate": 2.511870845204179e-06, + "loss": 0.0, + "step": 18110 + }, + { + "epoch": 77.00435897435898, + "grad_norm": 0.00018556939903646708, + "learning_rate": 2.5071225071225073e-06, + "loss": 0.0, + "step": 18120 + }, + { + "epoch": 77.00478632478632, + "grad_norm": 0.00014702802582178265, + "learning_rate": 2.502374169040836e-06, + "loss": 0.0, + "step": 18130 + }, + { + "epoch": 77.00521367521368, + "grad_norm": 0.004362730775028467, + "learning_rate": 2.4976258309591644e-06, + "loss": 0.0, + "step": 18140 + }, + { + "epoch": 77.00564102564103, + "grad_norm": 0.00015520227316301316, + "learning_rate": 2.492877492877493e-06, + "loss": 0.0013, + "step": 18150 + }, + { + "epoch": 77.00606837606837, + "grad_norm": 528.2244262695312, + "learning_rate": 2.488129154795822e-06, + "loss": 0.274, + "step": 18160 + }, + { + "epoch": 77.00649572649573, + "grad_norm": 0.00016038109606597573, + "learning_rate": 2.48338081671415e-06, + "loss": 0.0, + "step": 18170 + }, + { + "epoch": 77.00692307692307, + "grad_norm": 0.0002260198671137914, + "learning_rate": 2.478632478632479e-06, + "loss": 0.0022, + "step": 18180 + }, + { + "epoch": 77.00735042735043, + "grad_norm": 0.008054505102336407, + "learning_rate": 2.4738841405508075e-06, + "loss": 0.0718, + "step": 18190 + }, + { + "epoch": 77.00777777777778, + "grad_norm": 0.0019935595337301493, + "learning_rate": 2.469135802469136e-06, + "loss": 0.0, + "step": 18200 + }, + { + "epoch": 77.00820512820513, + "grad_norm": 0.0007075904286466539, + "learning_rate": 2.4643874643874645e-06, + "loss": 0.0, + "step": 18210 + }, + { + "epoch": 77.00863247863248, + "grad_norm": 0.0013958788476884365, + "learning_rate": 2.4596391263057933e-06, + "loss": 0.0, + "step": 18220 + }, + { + "epoch": 77.00905982905982, + "grad_norm": 0.0016203945269808173, + "learning_rate": 2.454890788224122e-06, + "loss": 0.0, + "step": 18230 + }, + { + "epoch": 77.00948717948718, + "grad_norm": 0.00014267765800468624, + "learning_rate": 2.4501424501424503e-06, + "loss": 0.0, + "step": 18240 + }, + { + "epoch": 77.00991452991452, + "grad_norm": 0.001626075361855328, + "learning_rate": 2.445394112060779e-06, + "loss": 0.0, + "step": 18250 + }, + { + "epoch": 77.01, + "eval_accuracy": 0.44, + "eval_loss": 5.542598724365234, + "eval_runtime": 33.0633, + "eval_samples_per_second": 0.756, + "eval_steps_per_second": 0.756, + "step": 18252 + }, + { + "epoch": 78.00034188034188, + "grad_norm": 0.0003114393330179155, + "learning_rate": 2.4406457739791077e-06, + "loss": 0.0, + "step": 18260 + }, + { + "epoch": 78.00076923076924, + "grad_norm": 0.00026152783539146185, + "learning_rate": 2.435897435897436e-06, + "loss": 0.0, + "step": 18270 + }, + { + "epoch": 78.00119658119658, + "grad_norm": 0.0003940732858609408, + "learning_rate": 2.4311490978157647e-06, + "loss": 0.0, + "step": 18280 + }, + { + "epoch": 78.00162393162393, + "grad_norm": 0.000839279149658978, + "learning_rate": 2.4264007597340935e-06, + "loss": 0.0059, + "step": 18290 + }, + { + "epoch": 78.00205128205128, + "grad_norm": 0.00048554647946730256, + "learning_rate": 2.4216524216524218e-06, + "loss": 0.0, + "step": 18300 + }, + { + "epoch": 78.00247863247863, + "grad_norm": 0.00023615985992364585, + "learning_rate": 2.4169040835707505e-06, + "loss": 0.0, + "step": 18310 + }, + { + "epoch": 78.00290598290599, + "grad_norm": 0.0001385568466503173, + "learning_rate": 2.412155745489079e-06, + "loss": 0.0, + "step": 18320 + }, + { + "epoch": 78.00333333333333, + "grad_norm": 0.0001412118726875633, + "learning_rate": 2.4074074074074075e-06, + "loss": 0.0, + "step": 18330 + }, + { + "epoch": 78.00376068376069, + "grad_norm": 0.0002054434735327959, + "learning_rate": 2.4026590693257362e-06, + "loss": 0.0, + "step": 18340 + }, + { + "epoch": 78.00418803418803, + "grad_norm": 0.0019602985121309757, + "learning_rate": 2.3979107312440645e-06, + "loss": 0.0, + "step": 18350 + }, + { + "epoch": 78.00461538461539, + "grad_norm": 0.00021056836703792214, + "learning_rate": 2.3931623931623937e-06, + "loss": 0.0, + "step": 18360 + }, + { + "epoch": 78.00504273504274, + "grad_norm": 40.98805618286133, + "learning_rate": 2.388414055080722e-06, + "loss": 0.0028, + "step": 18370 + }, + { + "epoch": 78.00547008547008, + "grad_norm": 0.00020283031335566193, + "learning_rate": 2.3836657169990502e-06, + "loss": 0.0, + "step": 18380 + }, + { + "epoch": 78.00589743589744, + "grad_norm": 0.00025577127235010266, + "learning_rate": 2.378917378917379e-06, + "loss": 0.0001, + "step": 18390 + }, + { + "epoch": 78.00632478632478, + "grad_norm": 0.00014881067909300327, + "learning_rate": 2.3741690408357077e-06, + "loss": 0.0002, + "step": 18400 + }, + { + "epoch": 78.00675213675214, + "grad_norm": 0.00021390440815594047, + "learning_rate": 2.3694207027540364e-06, + "loss": 0.0, + "step": 18410 + }, + { + "epoch": 78.00717948717949, + "grad_norm": 0.00022567427367903292, + "learning_rate": 2.3646723646723647e-06, + "loss": 0.0, + "step": 18420 + }, + { + "epoch": 78.00760683760684, + "grad_norm": 0.00015315061318688095, + "learning_rate": 2.3599240265906934e-06, + "loss": 0.0, + "step": 18430 + }, + { + "epoch": 78.00803418803419, + "grad_norm": 0.00031218226649798453, + "learning_rate": 2.355175688509022e-06, + "loss": 0.0009, + "step": 18440 + }, + { + "epoch": 78.00846153846153, + "grad_norm": 0.6312761902809143, + "learning_rate": 2.3504273504273504e-06, + "loss": 0.0002, + "step": 18450 + }, + { + "epoch": 78.00888888888889, + "grad_norm": 0.13864165544509888, + "learning_rate": 2.345679012345679e-06, + "loss": 0.0003, + "step": 18460 + }, + { + "epoch": 78.00931623931623, + "grad_norm": 0.0004563250986393541, + "learning_rate": 2.340930674264008e-06, + "loss": 0.0006, + "step": 18470 + }, + { + "epoch": 78.0097435897436, + "grad_norm": 0.0005144558963365853, + "learning_rate": 2.336182336182336e-06, + "loss": 0.0, + "step": 18480 + }, + { + "epoch": 78.01, + "eval_accuracy": 0.4, + "eval_loss": 6.4275970458984375, + "eval_runtime": 33.5576, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 18486 + }, + { + "epoch": 79.00017094017095, + "grad_norm": 0.00022311207430902869, + "learning_rate": 2.331433998100665e-06, + "loss": 0.0038, + "step": 18490 + }, + { + "epoch": 79.00059829059829, + "grad_norm": 0.0002333286392968148, + "learning_rate": 2.3266856600189936e-06, + "loss": 0.0, + "step": 18500 + }, + { + "epoch": 79.00102564102563, + "grad_norm": 0.00024575545103289187, + "learning_rate": 2.321937321937322e-06, + "loss": 0.0001, + "step": 18510 + }, + { + "epoch": 79.001452991453, + "grad_norm": 0.0004118859360460192, + "learning_rate": 2.3171889838556506e-06, + "loss": 0.0, + "step": 18520 + }, + { + "epoch": 79.00188034188034, + "grad_norm": 0.0018699835054576397, + "learning_rate": 2.3124406457739794e-06, + "loss": 0.0, + "step": 18530 + }, + { + "epoch": 79.0023076923077, + "grad_norm": 0.0005015349015593529, + "learning_rate": 2.307692307692308e-06, + "loss": 0.0, + "step": 18540 + }, + { + "epoch": 79.00273504273504, + "grad_norm": 0.00027656828751787543, + "learning_rate": 2.3029439696106364e-06, + "loss": 0.0, + "step": 18550 + }, + { + "epoch": 79.0031623931624, + "grad_norm": 0.0010210457257926464, + "learning_rate": 2.298195631528965e-06, + "loss": 0.0, + "step": 18560 + }, + { + "epoch": 79.00358974358974, + "grad_norm": 0.0015956490533426404, + "learning_rate": 2.293447293447294e-06, + "loss": 0.0, + "step": 18570 + }, + { + "epoch": 79.0040170940171, + "grad_norm": 0.001694871811196208, + "learning_rate": 2.288698955365622e-06, + "loss": 0.0, + "step": 18580 + }, + { + "epoch": 79.00444444444445, + "grad_norm": 0.0004087260167580098, + "learning_rate": 2.283950617283951e-06, + "loss": 0.0, + "step": 18590 + }, + { + "epoch": 79.00487179487179, + "grad_norm": 0.00020924824639223516, + "learning_rate": 2.2792022792022796e-06, + "loss": 0.0, + "step": 18600 + }, + { + "epoch": 79.00529914529915, + "grad_norm": 0.0006689398433081806, + "learning_rate": 2.274453941120608e-06, + "loss": 0.0, + "step": 18610 + }, + { + "epoch": 79.00572649572649, + "grad_norm": 0.0007431868580169976, + "learning_rate": 2.2697056030389366e-06, + "loss": 0.5593, + "step": 18620 + }, + { + "epoch": 79.00615384615385, + "grad_norm": 0.00014568243932444602, + "learning_rate": 2.2649572649572653e-06, + "loss": 0.0001, + "step": 18630 + }, + { + "epoch": 79.0065811965812, + "grad_norm": 0.0003919856681022793, + "learning_rate": 2.2602089268755936e-06, + "loss": 0.0, + "step": 18640 + }, + { + "epoch": 79.00700854700855, + "grad_norm": 0.00014353814185597003, + "learning_rate": 2.2554605887939223e-06, + "loss": 0.0, + "step": 18650 + }, + { + "epoch": 79.0074358974359, + "grad_norm": 0.00022350714425556362, + "learning_rate": 2.250712250712251e-06, + "loss": 0.0, + "step": 18660 + }, + { + "epoch": 79.00786324786324, + "grad_norm": 0.0012620558263733983, + "learning_rate": 2.2459639126305797e-06, + "loss": 0.0012, + "step": 18670 + }, + { + "epoch": 79.0082905982906, + "grad_norm": 0.0014929536264389753, + "learning_rate": 2.241215574548908e-06, + "loss": 0.0, + "step": 18680 + }, + { + "epoch": 79.00871794871794, + "grad_norm": 0.0001310633379034698, + "learning_rate": 2.2364672364672368e-06, + "loss": 0.0, + "step": 18690 + }, + { + "epoch": 79.0091452991453, + "grad_norm": 2.5448873043060303, + "learning_rate": 2.2317188983855655e-06, + "loss": 0.0006, + "step": 18700 + }, + { + "epoch": 79.00957264957265, + "grad_norm": 0.0006994788418523967, + "learning_rate": 2.2269705603038938e-06, + "loss": 0.0, + "step": 18710 + }, + { + "epoch": 79.01, + "grad_norm": 0.00040815817192196846, + "learning_rate": 2.222222222222222e-06, + "loss": 0.0, + "step": 18720 + }, + { + "epoch": 79.01, + "eval_accuracy": 0.4, + "eval_loss": 6.867609024047852, + "eval_runtime": 33.9385, + "eval_samples_per_second": 0.737, + "eval_steps_per_second": 0.737, + "step": 18720 + }, + { + "epoch": 80.00042735042734, + "grad_norm": 0.0001611685729585588, + "learning_rate": 2.2174738841405512e-06, + "loss": 0.0, + "step": 18730 + }, + { + "epoch": 80.0008547008547, + "grad_norm": 0.00016659338143654168, + "learning_rate": 2.2127255460588795e-06, + "loss": 0.0, + "step": 18740 + }, + { + "epoch": 80.00128205128205, + "grad_norm": 0.0005343634402379394, + "learning_rate": 2.2079772079772082e-06, + "loss": 0.0, + "step": 18750 + }, + { + "epoch": 80.0017094017094, + "grad_norm": 0.00014189437206368893, + "learning_rate": 2.203228869895537e-06, + "loss": 0.0, + "step": 18760 + }, + { + "epoch": 80.00213675213675, + "grad_norm": 0.00016400543972849846, + "learning_rate": 2.1984805318138653e-06, + "loss": 0.0, + "step": 18770 + }, + { + "epoch": 80.00256410256411, + "grad_norm": 0.00013671614578925073, + "learning_rate": 2.193732193732194e-06, + "loss": 0.0, + "step": 18780 + }, + { + "epoch": 80.00299145299145, + "grad_norm": 0.0015447050100192428, + "learning_rate": 2.1889838556505223e-06, + "loss": 0.0, + "step": 18790 + }, + { + "epoch": 80.00341880341881, + "grad_norm": 0.00014961874694563448, + "learning_rate": 2.1842355175688514e-06, + "loss": 0.0, + "step": 18800 + }, + { + "epoch": 80.00384615384615, + "grad_norm": 0.000166281548445113, + "learning_rate": 2.1794871794871797e-06, + "loss": 0.2076, + "step": 18810 + }, + { + "epoch": 80.0042735042735, + "grad_norm": 0.0008461292018182576, + "learning_rate": 2.174738841405508e-06, + "loss": 0.0, + "step": 18820 + }, + { + "epoch": 80.00470085470086, + "grad_norm": 0.00018119644664693624, + "learning_rate": 2.1699905033238367e-06, + "loss": 1.0097, + "step": 18830 + }, + { + "epoch": 80.0051282051282, + "grad_norm": 0.0001796925498638302, + "learning_rate": 2.1652421652421654e-06, + "loss": 0.0, + "step": 18840 + }, + { + "epoch": 80.00555555555556, + "grad_norm": 0.0001590051979292184, + "learning_rate": 2.1604938271604937e-06, + "loss": 0.0917, + "step": 18850 + }, + { + "epoch": 80.0059829059829, + "grad_norm": 0.00016695998783688992, + "learning_rate": 2.1557454890788225e-06, + "loss": 0.0, + "step": 18860 + }, + { + "epoch": 80.00641025641026, + "grad_norm": 0.0002668647503014654, + "learning_rate": 2.150997150997151e-06, + "loss": 0.0, + "step": 18870 + }, + { + "epoch": 80.0068376068376, + "grad_norm": 0.0016600067028775811, + "learning_rate": 2.14624881291548e-06, + "loss": 0.0, + "step": 18880 + }, + { + "epoch": 80.00726495726495, + "grad_norm": 0.0004276032268535346, + "learning_rate": 2.141500474833808e-06, + "loss": 0.0, + "step": 18890 + }, + { + "epoch": 80.00769230769231, + "grad_norm": 0.5101644992828369, + "learning_rate": 2.136752136752137e-06, + "loss": 0.0001, + "step": 18900 + }, + { + "epoch": 80.00811965811965, + "grad_norm": 0.00014222673780750483, + "learning_rate": 2.1320037986704656e-06, + "loss": 0.0, + "step": 18910 + }, + { + "epoch": 80.00854700854701, + "grad_norm": 0.0010868561221286654, + "learning_rate": 2.127255460588794e-06, + "loss": 0.0, + "step": 18920 + }, + { + "epoch": 80.00897435897436, + "grad_norm": 0.0014827632112428546, + "learning_rate": 2.1225071225071227e-06, + "loss": 0.0, + "step": 18930 + }, + { + "epoch": 80.00940170940171, + "grad_norm": 0.00015382345009129494, + "learning_rate": 2.1177587844254514e-06, + "loss": 0.0, + "step": 18940 + }, + { + "epoch": 80.00982905982906, + "grad_norm": 0.00015482779417652637, + "learning_rate": 2.1130104463437797e-06, + "loss": 0.0, + "step": 18950 + }, + { + "epoch": 80.01, + "eval_accuracy": 0.4, + "eval_loss": 6.669287204742432, + "eval_runtime": 33.799, + "eval_samples_per_second": 0.74, + "eval_steps_per_second": 0.74, + "step": 18954 + }, + { + "epoch": 81.00025641025641, + "grad_norm": 0.00021314578771125525, + "learning_rate": 2.1082621082621084e-06, + "loss": 0.0002, + "step": 18960 + }, + { + "epoch": 81.00068376068376, + "grad_norm": 0.00015724689001217484, + "learning_rate": 2.103513770180437e-06, + "loss": 0.0, + "step": 18970 + }, + { + "epoch": 81.00111111111111, + "grad_norm": 0.00015798299864400178, + "learning_rate": 2.0987654320987654e-06, + "loss": 0.6005, + "step": 18980 + }, + { + "epoch": 81.00153846153846, + "grad_norm": 0.00015623167564626783, + "learning_rate": 2.094017094017094e-06, + "loss": 0.0, + "step": 18990 + }, + { + "epoch": 81.00196581196582, + "grad_norm": 0.001082591712474823, + "learning_rate": 2.089268755935423e-06, + "loss": 0.0, + "step": 19000 + }, + { + "epoch": 81.00239316239316, + "grad_norm": 0.00016879322356544435, + "learning_rate": 2.0845204178537516e-06, + "loss": 0.0001, + "step": 19010 + }, + { + "epoch": 81.0028205128205, + "grad_norm": 0.0006452444358728826, + "learning_rate": 2.07977207977208e-06, + "loss": 0.0, + "step": 19020 + }, + { + "epoch": 81.00324786324786, + "grad_norm": 0.00027353313635103405, + "learning_rate": 2.0750237416904086e-06, + "loss": 0.0, + "step": 19030 + }, + { + "epoch": 81.00367521367521, + "grad_norm": 0.0008052527555264533, + "learning_rate": 2.0702754036087373e-06, + "loss": 0.0, + "step": 19040 + }, + { + "epoch": 81.00410256410257, + "grad_norm": 0.00014285088400356472, + "learning_rate": 2.0655270655270656e-06, + "loss": 0.0, + "step": 19050 + }, + { + "epoch": 81.00452991452991, + "grad_norm": 0.00035150200710631907, + "learning_rate": 2.0607787274453943e-06, + "loss": 0.0013, + "step": 19060 + }, + { + "epoch": 81.00495726495727, + "grad_norm": 0.0010856334120035172, + "learning_rate": 2.056030389363723e-06, + "loss": 0.0, + "step": 19070 + }, + { + "epoch": 81.00538461538461, + "grad_norm": 0.0009435404208488762, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.0, + "step": 19080 + }, + { + "epoch": 81.00581196581197, + "grad_norm": 0.001047900877892971, + "learning_rate": 2.04653371320038e-06, + "loss": 0.0, + "step": 19090 + }, + { + "epoch": 81.00623931623932, + "grad_norm": 0.00044585313298739493, + "learning_rate": 2.0417853751187088e-06, + "loss": 0.0, + "step": 19100 + }, + { + "epoch": 81.00666666666666, + "grad_norm": 0.00014062630361877382, + "learning_rate": 2.037037037037037e-06, + "loss": 0.0, + "step": 19110 + }, + { + "epoch": 81.00709401709402, + "grad_norm": 0.00024204060900956392, + "learning_rate": 2.032288698955366e-06, + "loss": 0.0, + "step": 19120 + }, + { + "epoch": 81.00752136752136, + "grad_norm": 0.000514078012201935, + "learning_rate": 2.0275403608736945e-06, + "loss": 0.0128, + "step": 19130 + }, + { + "epoch": 81.00794871794872, + "grad_norm": 0.0003828479675576091, + "learning_rate": 2.0227920227920232e-06, + "loss": 0.0, + "step": 19140 + }, + { + "epoch": 81.00837606837607, + "grad_norm": 0.000210257523576729, + "learning_rate": 2.0180436847103515e-06, + "loss": 0.0, + "step": 19150 + }, + { + "epoch": 81.00880341880342, + "grad_norm": 0.00039357185596600175, + "learning_rate": 2.01329534662868e-06, + "loss": 0.0, + "step": 19160 + }, + { + "epoch": 81.00923076923077, + "grad_norm": 0.00014624725736211985, + "learning_rate": 2.008547008547009e-06, + "loss": 0.0, + "step": 19170 + }, + { + "epoch": 81.00965811965811, + "grad_norm": 0.0019884561188519, + "learning_rate": 2.0037986704653373e-06, + "loss": 0.0, + "step": 19180 + }, + { + "epoch": 81.01, + "eval_accuracy": 0.4, + "eval_loss": 6.791877269744873, + "eval_runtime": 33.6676, + "eval_samples_per_second": 0.743, + "eval_steps_per_second": 0.743, + "step": 19188 + }, + { + "epoch": 82.00008547008547, + "grad_norm": 0.00014335830928757787, + "learning_rate": 1.9990503323836656e-06, + "loss": 0.0, + "step": 19190 + }, + { + "epoch": 82.00051282051282, + "grad_norm": 0.00016715576930437237, + "learning_rate": 1.9943019943019947e-06, + "loss": 0.0, + "step": 19200 + }, + { + "epoch": 82.00094017094017, + "grad_norm": 0.00015401339624077082, + "learning_rate": 1.989553656220323e-06, + "loss": 0.0, + "step": 19210 + }, + { + "epoch": 82.00136752136753, + "grad_norm": 0.0009122826741077006, + "learning_rate": 1.9848053181386517e-06, + "loss": 0.0, + "step": 19220 + }, + { + "epoch": 82.00179487179487, + "grad_norm": 0.0039600650779902935, + "learning_rate": 1.98005698005698e-06, + "loss": 0.0, + "step": 19230 + }, + { + "epoch": 82.00222222222222, + "grad_norm": 0.00023766346566844732, + "learning_rate": 1.9753086419753087e-06, + "loss": 0.0, + "step": 19240 + }, + { + "epoch": 82.00264957264957, + "grad_norm": 0.0002676662988960743, + "learning_rate": 1.9705603038936375e-06, + "loss": 0.0, + "step": 19250 + }, + { + "epoch": 82.00307692307692, + "grad_norm": 0.00012970353418495506, + "learning_rate": 1.9658119658119658e-06, + "loss": 0.9915, + "step": 19260 + }, + { + "epoch": 82.00350427350428, + "grad_norm": 0.001931848586536944, + "learning_rate": 1.9610636277302945e-06, + "loss": 0.0, + "step": 19270 + }, + { + "epoch": 82.00393162393162, + "grad_norm": 0.00016302384028676897, + "learning_rate": 1.956315289648623e-06, + "loss": 0.0, + "step": 19280 + }, + { + "epoch": 82.00435897435898, + "grad_norm": 0.0006244336836971343, + "learning_rate": 1.9515669515669515e-06, + "loss": 0.0, + "step": 19290 + }, + { + "epoch": 82.00478632478632, + "grad_norm": 0.0013233753852546215, + "learning_rate": 1.9468186134852802e-06, + "loss": 0.0, + "step": 19300 + }, + { + "epoch": 82.00521367521368, + "grad_norm": 0.00016536538896616548, + "learning_rate": 1.942070275403609e-06, + "loss": 0.0, + "step": 19310 + }, + { + "epoch": 82.00564102564103, + "grad_norm": 0.002657419303432107, + "learning_rate": 1.9373219373219372e-06, + "loss": 0.0276, + "step": 19320 + }, + { + "epoch": 82.00606837606837, + "grad_norm": 0.000514014158397913, + "learning_rate": 1.932573599240266e-06, + "loss": 0.0, + "step": 19330 + }, + { + "epoch": 82.00649572649573, + "grad_norm": 0.000511766062118113, + "learning_rate": 1.9278252611585947e-06, + "loss": 0.0, + "step": 19340 + }, + { + "epoch": 82.00692307692307, + "grad_norm": 0.0024127494543790817, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.0, + "step": 19350 + }, + { + "epoch": 82.00735042735043, + "grad_norm": 0.0032117695081979036, + "learning_rate": 1.9183285849952517e-06, + "loss": 0.0, + "step": 19360 + }, + { + "epoch": 82.00777777777778, + "grad_norm": 0.0002099874400300905, + "learning_rate": 1.9135802469135804e-06, + "loss": 0.0, + "step": 19370 + }, + { + "epoch": 82.00820512820513, + "grad_norm": 0.0007865933002904058, + "learning_rate": 1.908831908831909e-06, + "loss": 0.0, + "step": 19380 + }, + { + "epoch": 82.00863247863248, + "grad_norm": 0.000656930438708514, + "learning_rate": 1.9040835707502374e-06, + "loss": 0.0, + "step": 19390 + }, + { + "epoch": 82.00905982905982, + "grad_norm": 0.16030727326869965, + "learning_rate": 1.8993352326685664e-06, + "loss": 0.0001, + "step": 19400 + }, + { + "epoch": 82.00948717948718, + "grad_norm": 0.0001436812017345801, + "learning_rate": 1.8945868945868947e-06, + "loss": 0.0, + "step": 19410 + }, + { + "epoch": 82.00991452991452, + "grad_norm": 0.000723684614058584, + "learning_rate": 1.8898385565052232e-06, + "loss": 0.0, + "step": 19420 + }, + { + "epoch": 82.01, + "eval_accuracy": 0.4, + "eval_loss": 6.751951694488525, + "eval_runtime": 36.0289, + "eval_samples_per_second": 0.694, + "eval_steps_per_second": 0.694, + "step": 19422 + }, + { + "epoch": 83.00034188034188, + "grad_norm": 0.000560764514375478, + "learning_rate": 1.885090218423552e-06, + "loss": 0.0, + "step": 19430 + }, + { + "epoch": 83.00076923076924, + "grad_norm": 0.00022796269331593066, + "learning_rate": 1.8803418803418804e-06, + "loss": 0.0, + "step": 19440 + }, + { + "epoch": 83.00119658119658, + "grad_norm": 0.00013079910422675312, + "learning_rate": 1.875593542260209e-06, + "loss": 0.0, + "step": 19450 + }, + { + "epoch": 83.00162393162393, + "grad_norm": 0.0005907687591388822, + "learning_rate": 1.8708452041785376e-06, + "loss": 0.3814, + "step": 19460 + }, + { + "epoch": 83.00205128205128, + "grad_norm": 0.00020753988064825535, + "learning_rate": 1.8660968660968661e-06, + "loss": 0.0, + "step": 19470 + }, + { + "epoch": 83.00247863247863, + "grad_norm": 0.00022883056954015046, + "learning_rate": 1.8613485280151949e-06, + "loss": 0.0001, + "step": 19480 + }, + { + "epoch": 83.00290598290599, + "grad_norm": 0.0005515534430742264, + "learning_rate": 1.8566001899335234e-06, + "loss": 0.0003, + "step": 19490 + }, + { + "epoch": 83.00333333333333, + "grad_norm": 0.0006961479666642845, + "learning_rate": 1.8518518518518519e-06, + "loss": 0.0, + "step": 19500 + }, + { + "epoch": 83.00376068376069, + "grad_norm": 0.0002969226916320622, + "learning_rate": 1.8471035137701806e-06, + "loss": 0.0, + "step": 19510 + }, + { + "epoch": 83.00418803418803, + "grad_norm": 0.0003522407787386328, + "learning_rate": 1.842355175688509e-06, + "loss": 0.0, + "step": 19520 + }, + { + "epoch": 83.00461538461539, + "grad_norm": 0.0003604775993153453, + "learning_rate": 1.8376068376068378e-06, + "loss": 0.0, + "step": 19530 + }, + { + "epoch": 83.00504273504274, + "grad_norm": 0.00028395134722813964, + "learning_rate": 1.8328584995251663e-06, + "loss": 0.0022, + "step": 19540 + }, + { + "epoch": 83.00547008547008, + "grad_norm": 0.0031416804995387793, + "learning_rate": 1.8281101614434948e-06, + "loss": 0.0, + "step": 19550 + }, + { + "epoch": 83.00589743589744, + "grad_norm": 0.0014400221407413483, + "learning_rate": 1.8233618233618236e-06, + "loss": 0.0, + "step": 19560 + }, + { + "epoch": 83.00632478632478, + "grad_norm": 0.0030254279263317585, + "learning_rate": 1.818613485280152e-06, + "loss": 0.0, + "step": 19570 + }, + { + "epoch": 83.00675213675214, + "grad_norm": 0.00016171699098777026, + "learning_rate": 1.8138651471984806e-06, + "loss": 0.0, + "step": 19580 + }, + { + "epoch": 83.00717948717949, + "grad_norm": 0.3200160264968872, + "learning_rate": 1.8091168091168093e-06, + "loss": 0.0001, + "step": 19590 + }, + { + "epoch": 83.00760683760684, + "grad_norm": 0.00022861655452288687, + "learning_rate": 1.8043684710351378e-06, + "loss": 0.0006, + "step": 19600 + }, + { + "epoch": 83.00803418803419, + "grad_norm": 0.00018940556037705392, + "learning_rate": 1.7996201329534665e-06, + "loss": 0.0, + "step": 19610 + }, + { + "epoch": 83.00846153846153, + "grad_norm": 0.0001795957941794768, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0, + "step": 19620 + }, + { + "epoch": 83.00888888888889, + "grad_norm": 0.00021350740280468017, + "learning_rate": 1.7901234567901235e-06, + "loss": 0.0, + "step": 19630 + }, + { + "epoch": 83.00931623931623, + "grad_norm": 0.0026247838977724314, + "learning_rate": 1.7853751187084523e-06, + "loss": 0.0, + "step": 19640 + }, + { + "epoch": 83.0097435897436, + "grad_norm": 0.0006859219283796847, + "learning_rate": 1.7806267806267808e-06, + "loss": 0.0, + "step": 19650 + }, + { + "epoch": 83.01, + "eval_accuracy": 0.4, + "eval_loss": 6.7565107345581055, + "eval_runtime": 33.4174, + "eval_samples_per_second": 0.748, + "eval_steps_per_second": 0.748, + "step": 19656 + }, + { + "epoch": 84.00017094017095, + "grad_norm": 0.00018314612680114806, + "learning_rate": 1.7758784425451095e-06, + "loss": 0.0, + "step": 19660 + }, + { + "epoch": 84.00059829059829, + "grad_norm": 0.0016701172571629286, + "learning_rate": 1.771130104463438e-06, + "loss": 0.0001, + "step": 19670 + }, + { + "epoch": 84.00102564102563, + "grad_norm": 0.1983662247657776, + "learning_rate": 1.7663817663817665e-06, + "loss": 0.0, + "step": 19680 + }, + { + "epoch": 84.001452991453, + "grad_norm": 0.00020793025032617152, + "learning_rate": 1.7616334283000952e-06, + "loss": 0.0, + "step": 19690 + }, + { + "epoch": 84.00188034188034, + "grad_norm": 0.00018910918151959777, + "learning_rate": 1.7568850902184237e-06, + "loss": 0.0, + "step": 19700 + }, + { + "epoch": 84.0023076923077, + "grad_norm": 0.0018449919298291206, + "learning_rate": 1.7521367521367522e-06, + "loss": 0.0, + "step": 19710 + }, + { + "epoch": 84.00273504273504, + "grad_norm": 0.0001609017635928467, + "learning_rate": 1.747388414055081e-06, + "loss": 0.0, + "step": 19720 + }, + { + "epoch": 84.0031623931624, + "grad_norm": 0.00035862025106325746, + "learning_rate": 1.7426400759734095e-06, + "loss": 0.0, + "step": 19730 + }, + { + "epoch": 84.00358974358974, + "grad_norm": 0.0008477799710817635, + "learning_rate": 1.7378917378917382e-06, + "loss": 0.0, + "step": 19740 + }, + { + "epoch": 84.0040170940171, + "grad_norm": 0.0038533161859959364, + "learning_rate": 1.7331433998100667e-06, + "loss": 0.0, + "step": 19750 + }, + { + "epoch": 84.00444444444445, + "grad_norm": 0.0002816705673467368, + "learning_rate": 1.7283950617283952e-06, + "loss": 0.0, + "step": 19760 + }, + { + "epoch": 84.00487179487179, + "grad_norm": 0.000693616340868175, + "learning_rate": 1.723646723646724e-06, + "loss": 0.0, + "step": 19770 + }, + { + "epoch": 84.00529914529915, + "grad_norm": 0.00019760453142225742, + "learning_rate": 1.7188983855650524e-06, + "loss": 0.0001, + "step": 19780 + }, + { + "epoch": 84.00572649572649, + "grad_norm": 0.0001434326113667339, + "learning_rate": 1.7141500474833807e-06, + "loss": 0.0, + "step": 19790 + }, + { + "epoch": 84.00615384615385, + "grad_norm": 0.000697402167133987, + "learning_rate": 1.7094017094017097e-06, + "loss": 0.0, + "step": 19800 + }, + { + "epoch": 84.0065811965812, + "grad_norm": 0.0014031692408025265, + "learning_rate": 1.704653371320038e-06, + "loss": 0.0, + "step": 19810 + }, + { + "epoch": 84.00700854700855, + "grad_norm": 0.00014115864178165793, + "learning_rate": 1.6999050332383669e-06, + "loss": 0.0, + "step": 19820 + }, + { + "epoch": 84.0074358974359, + "grad_norm": 0.0013099861098453403, + "learning_rate": 1.6951566951566952e-06, + "loss": 0.0, + "step": 19830 + }, + { + "epoch": 84.00786324786324, + "grad_norm": 0.00024018825206439942, + "learning_rate": 1.6904083570750237e-06, + "loss": 0.0, + "step": 19840 + }, + { + "epoch": 84.0082905982906, + "grad_norm": 0.0007821761537343264, + "learning_rate": 1.6856600189933524e-06, + "loss": 0.0, + "step": 19850 + }, + { + "epoch": 84.00871794871794, + "grad_norm": 0.00013779080472886562, + "learning_rate": 1.680911680911681e-06, + "loss": 0.0, + "step": 19860 + }, + { + "epoch": 84.0091452991453, + "grad_norm": 0.0010995293268933892, + "learning_rate": 1.6761633428300099e-06, + "loss": 0.0, + "step": 19870 + }, + { + "epoch": 84.00957264957265, + "grad_norm": 0.0002249151439173147, + "learning_rate": 1.6714150047483382e-06, + "loss": 0.0, + "step": 19880 + }, + { + "epoch": 84.01, + "grad_norm": 0.00015639988123439252, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0, + "step": 19890 + }, + { + "epoch": 84.01, + "eval_accuracy": 0.4, + "eval_loss": 6.818636417388916, + "eval_runtime": 33.5438, + "eval_samples_per_second": 0.745, + "eval_steps_per_second": 0.745, + "step": 19890 + }, + { + "epoch": 85.00042735042734, + "grad_norm": 0.0001349115773336962, + "learning_rate": 1.6619183285849954e-06, + "loss": 0.0, + "step": 19900 + }, + { + "epoch": 85.0008547008547, + "grad_norm": 0.00012621526548173279, + "learning_rate": 1.6571699905033239e-06, + "loss": 0.0, + "step": 19910 + }, + { + "epoch": 85.00128205128205, + "grad_norm": 0.00013298312842380255, + "learning_rate": 1.6524216524216524e-06, + "loss": 0.0, + "step": 19920 + }, + { + "epoch": 85.0017094017094, + "grad_norm": 0.005015491507947445, + "learning_rate": 1.6476733143399811e-06, + "loss": 0.0, + "step": 19930 + }, + { + "epoch": 85.00213675213675, + "grad_norm": 0.00015077627904247493, + "learning_rate": 1.6429249762583096e-06, + "loss": 0.0, + "step": 19940 + }, + { + "epoch": 85.00256410256411, + "grad_norm": 0.000555673090275377, + "learning_rate": 1.6381766381766383e-06, + "loss": 0.0, + "step": 19950 + }, + { + "epoch": 85.00299145299145, + "grad_norm": 0.0001231930946232751, + "learning_rate": 1.6334283000949669e-06, + "loss": 0.0, + "step": 19960 + }, + { + "epoch": 85.00341880341881, + "grad_norm": 0.0003615697205532342, + "learning_rate": 1.6286799620132954e-06, + "loss": 0.0, + "step": 19970 + }, + { + "epoch": 85.00384615384615, + "grad_norm": 0.00024398553068749607, + "learning_rate": 1.623931623931624e-06, + "loss": 0.0, + "step": 19980 + }, + { + "epoch": 85.0042735042735, + "grad_norm": 0.0009278811048716307, + "learning_rate": 1.6191832858499526e-06, + "loss": 0.0, + "step": 19990 + }, + { + "epoch": 85.00470085470086, + "grad_norm": 0.0006217070040293038, + "learning_rate": 1.6144349477682813e-06, + "loss": 0.0, + "step": 20000 + }, + { + "epoch": 85.0051282051282, + "grad_norm": 0.000226405740249902, + "learning_rate": 1.6096866096866098e-06, + "loss": 0.0, + "step": 20010 + }, + { + "epoch": 85.00555555555556, + "grad_norm": 0.00026999341207556427, + "learning_rate": 1.6049382716049383e-06, + "loss": 0.0021, + "step": 20020 + }, + { + "epoch": 85.0059829059829, + "grad_norm": 0.00012980998144485056, + "learning_rate": 1.600189933523267e-06, + "loss": 0.0, + "step": 20030 + }, + { + "epoch": 85.00641025641026, + "grad_norm": 0.0006714918417856097, + "learning_rate": 1.5954415954415956e-06, + "loss": 0.0, + "step": 20040 + }, + { + "epoch": 85.0068376068376, + "grad_norm": 0.0013149407459422946, + "learning_rate": 1.590693257359924e-06, + "loss": 0.0, + "step": 20050 + }, + { + "epoch": 85.00726495726495, + "grad_norm": 0.00016666753799654543, + "learning_rate": 1.5859449192782528e-06, + "loss": 0.0, + "step": 20060 + }, + { + "epoch": 85.00769230769231, + "grad_norm": 0.0008010066230781376, + "learning_rate": 1.5811965811965813e-06, + "loss": 0.0557, + "step": 20070 + }, + { + "epoch": 85.00811965811965, + "grad_norm": 0.0002666155050974339, + "learning_rate": 1.57644824311491e-06, + "loss": 0.0, + "step": 20080 + }, + { + "epoch": 85.00854700854701, + "grad_norm": 0.00023051415337249637, + "learning_rate": 1.5716999050332385e-06, + "loss": 0.0, + "step": 20090 + }, + { + "epoch": 85.00897435897436, + "grad_norm": 0.0011181911686435342, + "learning_rate": 1.566951566951567e-06, + "loss": 0.0, + "step": 20100 + }, + { + "epoch": 85.00940170940171, + "grad_norm": 0.0008536745444871485, + "learning_rate": 1.5622032288698958e-06, + "loss": 0.1919, + "step": 20110 + }, + { + "epoch": 85.00982905982906, + "grad_norm": 0.0005208790535107255, + "learning_rate": 1.5574548907882243e-06, + "loss": 0.0, + "step": 20120 + }, + { + "epoch": 85.01, + "eval_accuracy": 0.4, + "eval_loss": 6.55494499206543, + "eval_runtime": 33.6803, + "eval_samples_per_second": 0.742, + "eval_steps_per_second": 0.742, + "step": 20124 + }, + { + "epoch": 86.00025641025641, + "grad_norm": 0.00014588376507163048, + "learning_rate": 1.552706552706553e-06, + "loss": 0.0, + "step": 20130 + }, + { + "epoch": 86.00068376068376, + "grad_norm": 0.00038126404979266226, + "learning_rate": 1.5479582146248815e-06, + "loss": 0.0, + "step": 20140 + }, + { + "epoch": 86.00111111111111, + "grad_norm": 0.0008430579327978194, + "learning_rate": 1.54320987654321e-06, + "loss": 0.0, + "step": 20150 + }, + { + "epoch": 86.00153846153846, + "grad_norm": 0.0002688311506062746, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.0, + "step": 20160 + }, + { + "epoch": 86.00196581196582, + "grad_norm": 0.00030692145810462534, + "learning_rate": 1.5337132003798672e-06, + "loss": 0.0, + "step": 20170 + }, + { + "epoch": 86.00239316239316, + "grad_norm": 0.00014298380119726062, + "learning_rate": 1.5289648622981957e-06, + "loss": 0.0, + "step": 20180 + }, + { + "epoch": 86.0028205128205, + "grad_norm": 0.0001439500047126785, + "learning_rate": 1.5242165242165245e-06, + "loss": 0.0, + "step": 20190 + }, + { + "epoch": 86.00324786324786, + "grad_norm": 0.0001333106920355931, + "learning_rate": 1.519468186134853e-06, + "loss": 0.0, + "step": 20200 + }, + { + "epoch": 86.00367521367521, + "grad_norm": 0.0004077716148458421, + "learning_rate": 1.5147198480531817e-06, + "loss": 0.0, + "step": 20210 + }, + { + "epoch": 86.00410256410257, + "grad_norm": 0.0005425411509349942, + "learning_rate": 1.5099715099715102e-06, + "loss": 0.0081, + "step": 20220 + }, + { + "epoch": 86.00452991452991, + "grad_norm": 0.0001421132474206388, + "learning_rate": 1.5052231718898385e-06, + "loss": 0.0, + "step": 20230 + }, + { + "epoch": 86.00495726495727, + "grad_norm": 0.0005388124845921993, + "learning_rate": 1.5004748338081674e-06, + "loss": 0.0002, + "step": 20240 + }, + { + "epoch": 86.00538461538461, + "grad_norm": 0.0001884761149995029, + "learning_rate": 1.4957264957264957e-06, + "loss": 0.0, + "step": 20250 + }, + { + "epoch": 86.00581196581197, + "grad_norm": 0.0015297207282856107, + "learning_rate": 1.4909781576448246e-06, + "loss": 0.0, + "step": 20260 + }, + { + "epoch": 86.00623931623932, + "grad_norm": 0.0007038248004391789, + "learning_rate": 1.486229819563153e-06, + "loss": 0.0, + "step": 20270 + }, + { + "epoch": 86.00666666666666, + "grad_norm": 0.00011914034985238686, + "learning_rate": 1.4814814814814815e-06, + "loss": 0.0, + "step": 20280 + }, + { + "epoch": 86.00709401709402, + "grad_norm": 0.0005224226042628288, + "learning_rate": 1.4767331433998102e-06, + "loss": 0.0, + "step": 20290 + }, + { + "epoch": 86.00752136752136, + "grad_norm": 0.00024032553483266383, + "learning_rate": 1.4719848053181387e-06, + "loss": 0.0, + "step": 20300 + }, + { + "epoch": 86.00794871794872, + "grad_norm": 0.0015607475070282817, + "learning_rate": 1.4672364672364672e-06, + "loss": 0.0, + "step": 20310 + }, + { + "epoch": 86.00837606837607, + "grad_norm": 0.000935015850700438, + "learning_rate": 1.462488129154796e-06, + "loss": 0.0, + "step": 20320 + }, + { + "epoch": 86.00880341880342, + "grad_norm": 0.0009676801273599267, + "learning_rate": 1.4577397910731244e-06, + "loss": 0.0, + "step": 20330 + }, + { + "epoch": 86.00923076923077, + "grad_norm": 0.0018314124317839742, + "learning_rate": 1.4529914529914531e-06, + "loss": 0.0, + "step": 20340 + }, + { + "epoch": 86.00965811965811, + "grad_norm": 0.00039655648288317025, + "learning_rate": 1.4482431149097816e-06, + "loss": 0.0, + "step": 20350 + }, + { + "epoch": 86.01, + "eval_accuracy": 0.4, + "eval_loss": 6.722276210784912, + "eval_runtime": 34.4477, + "eval_samples_per_second": 0.726, + "eval_steps_per_second": 0.726, + "step": 20358 + }, + { + "epoch": 87.00008547008547, + "grad_norm": 0.00019830497330985963, + "learning_rate": 1.4434947768281102e-06, + "loss": 0.0, + "step": 20360 + }, + { + "epoch": 87.00051282051282, + "grad_norm": 0.00021265774557832628, + "learning_rate": 1.4387464387464389e-06, + "loss": 0.0, + "step": 20370 + }, + { + "epoch": 87.00094017094017, + "grad_norm": 0.00013740244321525097, + "learning_rate": 1.4339981006647674e-06, + "loss": 0.0, + "step": 20380 + }, + { + "epoch": 87.00136752136753, + "grad_norm": 0.0008310623816214502, + "learning_rate": 1.429249762583096e-06, + "loss": 0.0, + "step": 20390 + }, + { + "epoch": 87.00179487179487, + "grad_norm": 0.00020110943296458572, + "learning_rate": 1.4245014245014246e-06, + "loss": 0.0, + "step": 20400 + }, + { + "epoch": 87.00222222222222, + "grad_norm": 0.0004781411844305694, + "learning_rate": 1.4197530864197531e-06, + "loss": 0.0, + "step": 20410 + }, + { + "epoch": 87.00264957264957, + "grad_norm": 0.0001626553712412715, + "learning_rate": 1.4150047483380818e-06, + "loss": 0.0, + "step": 20420 + }, + { + "epoch": 87.00307692307692, + "grad_norm": 0.00045715321903117, + "learning_rate": 1.4102564102564104e-06, + "loss": 0.0, + "step": 20430 + }, + { + "epoch": 87.00350427350428, + "grad_norm": 0.00021040246065240353, + "learning_rate": 1.4055080721747389e-06, + "loss": 0.0, + "step": 20440 + }, + { + "epoch": 87.00393162393162, + "grad_norm": 0.00011547945905476809, + "learning_rate": 1.4007597340930676e-06, + "loss": 0.0, + "step": 20450 + }, + { + "epoch": 87.00435897435898, + "grad_norm": 0.00020624134049285203, + "learning_rate": 1.396011396011396e-06, + "loss": 0.0, + "step": 20460 + }, + { + "epoch": 87.00478632478632, + "grad_norm": 0.0008065582369454205, + "learning_rate": 1.3912630579297248e-06, + "loss": 0.0, + "step": 20470 + }, + { + "epoch": 87.00521367521368, + "grad_norm": 0.00020026916172355413, + "learning_rate": 1.3865147198480533e-06, + "loss": 0.0, + "step": 20480 + }, + { + "epoch": 87.00564102564103, + "grad_norm": 0.00014089647447690368, + "learning_rate": 1.3817663817663818e-06, + "loss": 0.0, + "step": 20490 + }, + { + "epoch": 87.00606837606837, + "grad_norm": 0.0007202147389762104, + "learning_rate": 1.3770180436847105e-06, + "loss": 0.0, + "step": 20500 + }, + { + "epoch": 87.00649572649573, + "grad_norm": 0.0001863235083874315, + "learning_rate": 1.372269705603039e-06, + "loss": 0.0, + "step": 20510 + }, + { + "epoch": 87.00692307692307, + "grad_norm": 0.00013089961430523545, + "learning_rate": 1.3675213675213678e-06, + "loss": 0.0, + "step": 20520 + }, + { + "epoch": 87.00735042735043, + "grad_norm": 0.000519382010679692, + "learning_rate": 1.3627730294396963e-06, + "loss": 0.0, + "step": 20530 + }, + { + "epoch": 87.00777777777778, + "grad_norm": 0.00013078245683573186, + "learning_rate": 1.3580246913580248e-06, + "loss": 0.0, + "step": 20540 + }, + { + "epoch": 87.00820512820513, + "grad_norm": 0.0006038915598765016, + "learning_rate": 1.3532763532763535e-06, + "loss": 0.0, + "step": 20550 + }, + { + "epoch": 87.00863247863248, + "grad_norm": 0.0002612897951621562, + "learning_rate": 1.348528015194682e-06, + "loss": 0.0, + "step": 20560 + }, + { + "epoch": 87.00905982905982, + "grad_norm": 0.00011701687617460266, + "learning_rate": 1.3437796771130105e-06, + "loss": 0.3334, + "step": 20570 + }, + { + "epoch": 87.00948717948718, + "grad_norm": 0.0023586824536323547, + "learning_rate": 1.3390313390313392e-06, + "loss": 0.0, + "step": 20580 + }, + { + "epoch": 87.00991452991452, + "grad_norm": 0.00032859586644917727, + "learning_rate": 1.3342830009496678e-06, + "loss": 0.0, + "step": 20590 + }, + { + "epoch": 87.01, + "eval_accuracy": 0.4, + "eval_loss": 6.909550189971924, + "eval_runtime": 34.96, + "eval_samples_per_second": 0.715, + "eval_steps_per_second": 0.715, + "step": 20592 + }, + { + "epoch": 88.00034188034188, + "grad_norm": 0.00037892567343078554, + "learning_rate": 1.3295346628679965e-06, + "loss": 0.0, + "step": 20600 + }, + { + "epoch": 88.00076923076924, + "grad_norm": 0.00016175136261153966, + "learning_rate": 1.324786324786325e-06, + "loss": 0.0, + "step": 20610 + }, + { + "epoch": 88.00119658119658, + "grad_norm": 0.00011514942161738873, + "learning_rate": 1.3200379867046533e-06, + "loss": 0.0, + "step": 20620 + }, + { + "epoch": 88.00162393162393, + "grad_norm": 0.000466162251541391, + "learning_rate": 1.3152896486229822e-06, + "loss": 0.0, + "step": 20630 + }, + { + "epoch": 88.00205128205128, + "grad_norm": 0.0005339948693290353, + "learning_rate": 1.3105413105413107e-06, + "loss": 0.0, + "step": 20640 + }, + { + "epoch": 88.00247863247863, + "grad_norm": 0.00011431486200308427, + "learning_rate": 1.305792972459639e-06, + "loss": 0.0, + "step": 20650 + }, + { + "epoch": 88.00290598290599, + "grad_norm": 0.00010979742364725098, + "learning_rate": 1.301044634377968e-06, + "loss": 0.0, + "step": 20660 + }, + { + "epoch": 88.00333333333333, + "grad_norm": 0.001283150864765048, + "learning_rate": 1.2962962962962962e-06, + "loss": 0.0, + "step": 20670 + }, + { + "epoch": 88.00376068376069, + "grad_norm": 0.0002451012551318854, + "learning_rate": 1.2915479582146252e-06, + "loss": 0.0, + "step": 20680 + }, + { + "epoch": 88.00418803418803, + "grad_norm": 0.00014794590242672712, + "learning_rate": 1.2867996201329535e-06, + "loss": 0.0, + "step": 20690 + }, + { + "epoch": 88.00461538461539, + "grad_norm": 0.0005672202096320689, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0, + "step": 20700 + }, + { + "epoch": 88.00504273504274, + "grad_norm": 0.00013605415006168187, + "learning_rate": 1.2773029439696107e-06, + "loss": 0.0, + "step": 20710 + }, + { + "epoch": 88.00547008547008, + "grad_norm": 0.00011451664613559842, + "learning_rate": 1.2725546058879392e-06, + "loss": 0.0, + "step": 20720 + }, + { + "epoch": 88.00589743589744, + "grad_norm": 0.00019140506628900766, + "learning_rate": 1.267806267806268e-06, + "loss": 0.0, + "step": 20730 + }, + { + "epoch": 88.00632478632478, + "grad_norm": 0.00013691678759641945, + "learning_rate": 1.2630579297245964e-06, + "loss": 0.0, + "step": 20740 + }, + { + "epoch": 88.00675213675214, + "grad_norm": 0.00016725384921301156, + "learning_rate": 1.258309591642925e-06, + "loss": 0.0, + "step": 20750 + }, + { + "epoch": 88.00717948717949, + "grad_norm": 0.00016475172014907002, + "learning_rate": 1.2535612535612537e-06, + "loss": 0.0, + "step": 20760 + }, + { + "epoch": 88.00760683760684, + "grad_norm": 0.00015170658298302442, + "learning_rate": 1.2488129154795822e-06, + "loss": 0.0, + "step": 20770 + }, + { + "epoch": 88.00803418803419, + "grad_norm": 0.012962790206074715, + "learning_rate": 1.244064577397911e-06, + "loss": 0.0, + "step": 20780 + }, + { + "epoch": 88.00846153846153, + "grad_norm": 0.00019268876349087805, + "learning_rate": 1.2393162393162394e-06, + "loss": 0.0, + "step": 20790 + }, + { + "epoch": 88.00888888888889, + "grad_norm": 0.00012487791536841542, + "learning_rate": 1.234567901234568e-06, + "loss": 0.0, + "step": 20800 + }, + { + "epoch": 88.00931623931623, + "grad_norm": 0.009372721426188946, + "learning_rate": 1.2298195631528966e-06, + "loss": 0.0, + "step": 20810 + }, + { + "epoch": 88.0097435897436, + "grad_norm": 0.0001522890670457855, + "learning_rate": 1.2250712250712251e-06, + "loss": 0.0, + "step": 20820 + }, + { + "epoch": 88.01, + "eval_accuracy": 0.4, + "eval_loss": 6.991762638092041, + "eval_runtime": 34.8658, + "eval_samples_per_second": 0.717, + "eval_steps_per_second": 0.717, + "step": 20826 + }, + { + "epoch": 89.00017094017095, + "grad_norm": 1.7047139406204224, + "learning_rate": 1.2203228869895539e-06, + "loss": 0.0003, + "step": 20830 + }, + { + "epoch": 89.00059829059829, + "grad_norm": 0.004476090893149376, + "learning_rate": 1.2155745489078824e-06, + "loss": 0.0, + "step": 20840 + }, + { + "epoch": 89.00102564102563, + "grad_norm": 0.0009234125609509647, + "learning_rate": 1.2108262108262109e-06, + "loss": 0.0, + "step": 20850 + }, + { + "epoch": 89.001452991453, + "grad_norm": 0.0005792377050966024, + "learning_rate": 1.2060778727445396e-06, + "loss": 0.0, + "step": 20860 + }, + { + "epoch": 89.00188034188034, + "grad_norm": 0.0001534123730380088, + "learning_rate": 1.2013295346628681e-06, + "loss": 0.0, + "step": 20870 + }, + { + "epoch": 89.0023076923077, + "grad_norm": 0.0013493632432073355, + "learning_rate": 1.1965811965811968e-06, + "loss": 0.0, + "step": 20880 + }, + { + "epoch": 89.00273504273504, + "grad_norm": 0.00017192041559610516, + "learning_rate": 1.1918328584995251e-06, + "loss": 0.0, + "step": 20890 + }, + { + "epoch": 89.0031623931624, + "grad_norm": 0.00012882999726571143, + "learning_rate": 1.1870845204178538e-06, + "loss": 0.0, + "step": 20900 + }, + { + "epoch": 89.00358974358974, + "grad_norm": 0.0013170551974326372, + "learning_rate": 1.1823361823361824e-06, + "loss": 0.0, + "step": 20910 + }, + { + "epoch": 89.0040170940171, + "grad_norm": 0.0004404323117341846, + "learning_rate": 1.177587844254511e-06, + "loss": 0.0, + "step": 20920 + }, + { + "epoch": 89.00444444444445, + "grad_norm": 0.00011723486386472359, + "learning_rate": 1.1728395061728396e-06, + "loss": 0.0015, + "step": 20930 + }, + { + "epoch": 89.00487179487179, + "grad_norm": 0.00012327823787927628, + "learning_rate": 1.168091168091168e-06, + "loss": 0.0, + "step": 20940 + }, + { + "epoch": 89.00529914529915, + "grad_norm": 0.00013142797979526222, + "learning_rate": 1.1633428300094968e-06, + "loss": 0.0, + "step": 20950 + }, + { + "epoch": 89.00572649572649, + "grad_norm": 0.0001497309422120452, + "learning_rate": 1.1585944919278253e-06, + "loss": 0.0, + "step": 20960 + }, + { + "epoch": 89.00615384615385, + "grad_norm": 0.0004792478575836867, + "learning_rate": 1.153846153846154e-06, + "loss": 0.0, + "step": 20970 + }, + { + "epoch": 89.0065811965812, + "grad_norm": 0.0001388894597766921, + "learning_rate": 1.1490978157644825e-06, + "loss": 0.0, + "step": 20980 + }, + { + "epoch": 89.00700854700855, + "grad_norm": 0.00023399751808028668, + "learning_rate": 1.144349477682811e-06, + "loss": 0.0, + "step": 20990 + }, + { + "epoch": 89.0074358974359, + "grad_norm": 0.00014475570060312748, + "learning_rate": 1.1396011396011398e-06, + "loss": 0.0, + "step": 21000 + }, + { + "epoch": 89.00786324786324, + "grad_norm": 0.00012018286361126229, + "learning_rate": 1.1348528015194683e-06, + "loss": 0.0, + "step": 21010 + }, + { + "epoch": 89.0082905982906, + "grad_norm": 0.00017406410188414156, + "learning_rate": 1.1301044634377968e-06, + "loss": 0.0002, + "step": 21020 + }, + { + "epoch": 89.00871794871794, + "grad_norm": 0.0001829592656577006, + "learning_rate": 1.1253561253561255e-06, + "loss": 0.0, + "step": 21030 + }, + { + "epoch": 89.0091452991453, + "grad_norm": 0.0005391041049733758, + "learning_rate": 1.120607787274454e-06, + "loss": 0.0, + "step": 21040 + }, + { + "epoch": 89.00957264957265, + "grad_norm": 0.00044315739069133997, + "learning_rate": 1.1158594491927827e-06, + "loss": 0.0, + "step": 21050 + }, + { + "epoch": 89.01, + "grad_norm": 0.00018860511772800237, + "learning_rate": 1.111111111111111e-06, + "loss": 0.0, + "step": 21060 + }, + { + "epoch": 89.01, + "eval_accuracy": 0.4, + "eval_loss": 7.22465705871582, + "eval_runtime": 35.0687, + "eval_samples_per_second": 0.713, + "eval_steps_per_second": 0.713, + "step": 21060 + }, + { + "epoch": 90.00042735042734, + "grad_norm": 0.00021203322103247046, + "learning_rate": 1.1063627730294398e-06, + "loss": 0.0, + "step": 21070 + }, + { + "epoch": 90.0008547008547, + "grad_norm": 0.00013690422929357737, + "learning_rate": 1.1016144349477685e-06, + "loss": 0.0, + "step": 21080 + }, + { + "epoch": 90.00128205128205, + "grad_norm": 0.0015827484894543886, + "learning_rate": 1.096866096866097e-06, + "loss": 0.6422, + "step": 21090 + }, + { + "epoch": 90.0017094017094, + "grad_norm": 0.00015395709488075227, + "learning_rate": 1.0921177587844257e-06, + "loss": 0.0, + "step": 21100 + }, + { + "epoch": 90.00213675213675, + "grad_norm": 0.0011581846047192812, + "learning_rate": 1.087369420702754e-06, + "loss": 0.0, + "step": 21110 + }, + { + "epoch": 90.00256410256411, + "grad_norm": 0.00010031823330791667, + "learning_rate": 1.0826210826210827e-06, + "loss": 0.0, + "step": 21120 + }, + { + "epoch": 90.00299145299145, + "grad_norm": 0.00012422981671988964, + "learning_rate": 1.0778727445394112e-06, + "loss": 0.0, + "step": 21130 + }, + { + "epoch": 90.00341880341881, + "grad_norm": 0.009594716131687164, + "learning_rate": 1.07312440645774e-06, + "loss": 0.0, + "step": 21140 + }, + { + "epoch": 90.00384615384615, + "grad_norm": 0.010623271577060223, + "learning_rate": 1.0683760683760685e-06, + "loss": 0.0, + "step": 21150 + }, + { + "epoch": 90.0042735042735, + "grad_norm": 0.0004154906782787293, + "learning_rate": 1.063627730294397e-06, + "loss": 0.0, + "step": 21160 + }, + { + "epoch": 90.00470085470086, + "grad_norm": 0.0005235851858742535, + "learning_rate": 1.0588793922127257e-06, + "loss": 0.0, + "step": 21170 + }, + { + "epoch": 90.0051282051282, + "grad_norm": 0.0006127028027549386, + "learning_rate": 1.0541310541310542e-06, + "loss": 0.0, + "step": 21180 + }, + { + "epoch": 90.00555555555556, + "grad_norm": 0.00012018286361126229, + "learning_rate": 1.0493827160493827e-06, + "loss": 0.0, + "step": 21190 + }, + { + "epoch": 90.0059829059829, + "grad_norm": 0.00017059137462638319, + "learning_rate": 1.0446343779677114e-06, + "loss": 0.0, + "step": 21200 + }, + { + "epoch": 90.00641025641026, + "grad_norm": 0.00011609737703111023, + "learning_rate": 1.03988603988604e-06, + "loss": 0.0012, + "step": 21210 + }, + { + "epoch": 90.0068376068376, + "grad_norm": 0.00016542985395062715, + "learning_rate": 1.0351377018043687e-06, + "loss": 0.0001, + "step": 21220 + }, + { + "epoch": 90.00726495726495, + "grad_norm": 0.22980256378650665, + "learning_rate": 1.0303893637226972e-06, + "loss": 0.0001, + "step": 21230 + }, + { + "epoch": 90.00769230769231, + "grad_norm": 0.0003457261191215366, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.0, + "step": 21240 + }, + { + "epoch": 90.00811965811965, + "grad_norm": 0.0006093020201660693, + "learning_rate": 1.0208926875593544e-06, + "loss": 0.0, + "step": 21250 + }, + { + "epoch": 90.00854700854701, + "grad_norm": 0.00011088740575360134, + "learning_rate": 1.016144349477683e-06, + "loss": 0.0, + "step": 21260 + }, + { + "epoch": 90.00897435897436, + "grad_norm": 0.0004857526218984276, + "learning_rate": 1.0113960113960116e-06, + "loss": 0.0, + "step": 21270 + }, + { + "epoch": 90.00940170940171, + "grad_norm": 0.00013570766896009445, + "learning_rate": 1.00664767331434e-06, + "loss": 0.0, + "step": 21280 + }, + { + "epoch": 90.00982905982906, + "grad_norm": 0.0014373520389199257, + "learning_rate": 1.0018993352326686e-06, + "loss": 0.0001, + "step": 21290 + }, + { + "epoch": 90.01, + "eval_accuracy": 0.4, + "eval_loss": 7.226686954498291, + "eval_runtime": 37.4838, + "eval_samples_per_second": 0.667, + "eval_steps_per_second": 0.667, + "step": 21294 + }, + { + "epoch": 91.00025641025641, + "grad_norm": 0.00011334038572385907, + "learning_rate": 9.971509971509974e-07, + "loss": 0.0, + "step": 21300 + }, + { + "epoch": 91.00068376068376, + "grad_norm": 0.00015362763951998204, + "learning_rate": 9.924026590693259e-07, + "loss": 0.0, + "step": 21310 + }, + { + "epoch": 91.00111111111111, + "grad_norm": 0.0003192702424712479, + "learning_rate": 9.876543209876544e-07, + "loss": 0.0, + "step": 21320 + }, + { + "epoch": 91.00153846153846, + "grad_norm": 0.0005757631151936948, + "learning_rate": 9.829059829059829e-07, + "loss": 0.0, + "step": 21330 + }, + { + "epoch": 91.00196581196582, + "grad_norm": 0.0004981070524081588, + "learning_rate": 9.781576448243116e-07, + "loss": 0.0, + "step": 21340 + }, + { + "epoch": 91.00239316239316, + "grad_norm": 0.000131376669742167, + "learning_rate": 9.734093067426401e-07, + "loss": 0.0, + "step": 21350 + }, + { + "epoch": 91.0028205128205, + "grad_norm": 1261.06396484375, + "learning_rate": 9.686609686609686e-07, + "loss": 0.3664, + "step": 21360 + }, + { + "epoch": 91.00324786324786, + "grad_norm": 0.00013327557826414704, + "learning_rate": 9.639126305792973e-07, + "loss": 0.0, + "step": 21370 + }, + { + "epoch": 91.00367521367521, + "grad_norm": 0.0001816957665141672, + "learning_rate": 9.591642924976258e-07, + "loss": 0.0, + "step": 21380 + }, + { + "epoch": 91.00410256410257, + "grad_norm": 0.004342993255704641, + "learning_rate": 9.544159544159546e-07, + "loss": 0.0, + "step": 21390 + }, + { + "epoch": 91.00452991452991, + "grad_norm": 0.0002144295140169561, + "learning_rate": 9.496676163342832e-07, + "loss": 0.0, + "step": 21400 + }, + { + "epoch": 91.00495726495727, + "grad_norm": 0.0005280747427605093, + "learning_rate": 9.449192782526116e-07, + "loss": 0.0001, + "step": 21410 + }, + { + "epoch": 91.00538461538461, + "grad_norm": 0.0002650852547958493, + "learning_rate": 9.401709401709402e-07, + "loss": 0.0, + "step": 21420 + }, + { + "epoch": 91.00581196581197, + "grad_norm": 0.0003367229946888983, + "learning_rate": 9.354226020892688e-07, + "loss": 0.0, + "step": 21430 + }, + { + "epoch": 91.00623931623932, + "grad_norm": 0.00010731956717791036, + "learning_rate": 9.306742640075974e-07, + "loss": 0.0, + "step": 21440 + }, + { + "epoch": 91.00666666666666, + "grad_norm": 0.00010221748379990458, + "learning_rate": 9.259259259259259e-07, + "loss": 0.0, + "step": 21450 + }, + { + "epoch": 91.00709401709402, + "grad_norm": 0.00013446349475998431, + "learning_rate": 9.211775878442545e-07, + "loss": 0.0, + "step": 21460 + }, + { + "epoch": 91.00752136752136, + "grad_norm": 0.00018187399837188423, + "learning_rate": 9.164292497625832e-07, + "loss": 0.0, + "step": 21470 + }, + { + "epoch": 91.00794871794872, + "grad_norm": 0.00015760202950332314, + "learning_rate": 9.116809116809118e-07, + "loss": 0.0, + "step": 21480 + }, + { + "epoch": 91.00837606837607, + "grad_norm": 0.00026204713503830135, + "learning_rate": 9.069325735992403e-07, + "loss": 0.0, + "step": 21490 + }, + { + "epoch": 91.00880341880342, + "grad_norm": 0.00038499056245200336, + "learning_rate": 9.021842355175689e-07, + "loss": 0.0, + "step": 21500 + }, + { + "epoch": 91.00923076923077, + "grad_norm": 0.000741164549253881, + "learning_rate": 8.974358974358975e-07, + "loss": 0.0, + "step": 21510 + }, + { + "epoch": 91.00965811965811, + "grad_norm": 0.00012043473543599248, + "learning_rate": 8.926875593542261e-07, + "loss": 0.0, + "step": 21520 + }, + { + "epoch": 91.01, + "eval_accuracy": 0.4, + "eval_loss": 6.982587814331055, + "eval_runtime": 37.2182, + "eval_samples_per_second": 0.672, + "eval_steps_per_second": 0.672, + "step": 21528 + }, + { + "epoch": 92.00008547008547, + "grad_norm": 0.0001411245611961931, + "learning_rate": 8.879392212725547e-07, + "loss": 0.0, + "step": 21530 + }, + { + "epoch": 92.00051282051282, + "grad_norm": 0.0017277189763262868, + "learning_rate": 8.831908831908833e-07, + "loss": 0.0, + "step": 21540 + }, + { + "epoch": 92.00094017094017, + "grad_norm": 0.00011866500426549464, + "learning_rate": 8.784425451092119e-07, + "loss": 0.0, + "step": 21550 + }, + { + "epoch": 92.00136752136753, + "grad_norm": 0.00031862963805906475, + "learning_rate": 8.736942070275405e-07, + "loss": 0.0, + "step": 21560 + }, + { + "epoch": 92.00179487179487, + "grad_norm": 0.00011709554382832721, + "learning_rate": 8.689458689458691e-07, + "loss": 0.0, + "step": 21570 + }, + { + "epoch": 92.00222222222222, + "grad_norm": 0.0005032451008446515, + "learning_rate": 8.641975308641976e-07, + "loss": 0.0, + "step": 21580 + }, + { + "epoch": 92.00264957264957, + "grad_norm": 0.0010287113254889846, + "learning_rate": 8.594491927825262e-07, + "loss": 0.0, + "step": 21590 + }, + { + "epoch": 92.00307692307692, + "grad_norm": 0.0004769675724674016, + "learning_rate": 8.547008547008548e-07, + "loss": 0.0, + "step": 21600 + }, + { + "epoch": 92.00350427350428, + "grad_norm": 0.00032892791205085814, + "learning_rate": 8.499525166191834e-07, + "loss": 0.0, + "step": 21610 + }, + { + "epoch": 92.00393162393162, + "grad_norm": 0.0001310220395680517, + "learning_rate": 8.452041785375118e-07, + "loss": 0.0, + "step": 21620 + }, + { + "epoch": 92.00435897435898, + "grad_norm": 0.0010653804056346416, + "learning_rate": 8.404558404558405e-07, + "loss": 0.0, + "step": 21630 + }, + { + "epoch": 92.00478632478632, + "grad_norm": 0.00020127870084252208, + "learning_rate": 8.357075023741691e-07, + "loss": 0.0, + "step": 21640 + }, + { + "epoch": 92.00521367521368, + "grad_norm": 0.0025258706882596016, + "learning_rate": 8.309591642924977e-07, + "loss": 0.0, + "step": 21650 + }, + { + "epoch": 92.00564102564103, + "grad_norm": 0.00015467533376067877, + "learning_rate": 8.262108262108262e-07, + "loss": 0.9533, + "step": 21660 + }, + { + "epoch": 92.00606837606837, + "grad_norm": 0.00014860654482617974, + "learning_rate": 8.214624881291548e-07, + "loss": 0.0, + "step": 21670 + }, + { + "epoch": 92.00649572649573, + "grad_norm": 0.0005292973946779966, + "learning_rate": 8.167141500474834e-07, + "loss": 0.0, + "step": 21680 + }, + { + "epoch": 92.00692307692307, + "grad_norm": 0.00015186695964075625, + "learning_rate": 8.11965811965812e-07, + "loss": 0.0, + "step": 21690 + }, + { + "epoch": 92.00735042735043, + "grad_norm": 0.00021409436885733157, + "learning_rate": 8.072174738841407e-07, + "loss": 0.0, + "step": 21700 + }, + { + "epoch": 92.00777777777778, + "grad_norm": 0.0007851360714994371, + "learning_rate": 8.024691358024692e-07, + "loss": 0.0, + "step": 21710 + }, + { + "epoch": 92.00820512820513, + "grad_norm": 0.00014987800386734307, + "learning_rate": 7.977207977207978e-07, + "loss": 0.0, + "step": 21720 + }, + { + "epoch": 92.00863247863248, + "grad_norm": 0.00021006350289098918, + "learning_rate": 7.929724596391264e-07, + "loss": 0.0001, + "step": 21730 + }, + { + "epoch": 92.00905982905982, + "grad_norm": 0.00011848520807689056, + "learning_rate": 7.88224121557455e-07, + "loss": 0.0, + "step": 21740 + }, + { + "epoch": 92.00948717948718, + "grad_norm": 0.0001388939272146672, + "learning_rate": 7.834757834757835e-07, + "loss": 0.0, + "step": 21750 + }, + { + "epoch": 92.00991452991452, + "grad_norm": 0.00011886980064446107, + "learning_rate": 7.787274453941121e-07, + "loss": 0.0, + "step": 21760 + }, + { + "epoch": 92.01, + "eval_accuracy": 0.4, + "eval_loss": 6.638452529907227, + "eval_runtime": 36.9317, + "eval_samples_per_second": 0.677, + "eval_steps_per_second": 0.677, + "step": 21762 + }, + { + "epoch": 93.00034188034188, + "grad_norm": 0.00011863345571327955, + "learning_rate": 7.739791073124407e-07, + "loss": 0.0, + "step": 21770 + }, + { + "epoch": 93.00076923076924, + "grad_norm": 0.000148111954331398, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0, + "step": 21780 + }, + { + "epoch": 93.00119658119658, + "grad_norm": 0.0006991415284574032, + "learning_rate": 7.644824311490979e-07, + "loss": 0.0, + "step": 21790 + }, + { + "epoch": 93.00162393162393, + "grad_norm": 0.006693857256323099, + "learning_rate": 7.597340930674265e-07, + "loss": 0.0, + "step": 21800 + }, + { + "epoch": 93.00205128205128, + "grad_norm": 0.0005863769329153001, + "learning_rate": 7.549857549857551e-07, + "loss": 0.0, + "step": 21810 + }, + { + "epoch": 93.00247863247863, + "grad_norm": 0.00011994949454674497, + "learning_rate": 7.502374169040837e-07, + "loss": 0.0, + "step": 21820 + }, + { + "epoch": 93.00290598290599, + "grad_norm": 0.00011668866500258446, + "learning_rate": 7.454890788224123e-07, + "loss": 0.0001, + "step": 21830 + }, + { + "epoch": 93.00333333333333, + "grad_norm": 0.00011160215217387304, + "learning_rate": 7.407407407407407e-07, + "loss": 0.0, + "step": 21840 + }, + { + "epoch": 93.00376068376069, + "grad_norm": 0.0001440770720364526, + "learning_rate": 7.359924026590693e-07, + "loss": 0.0, + "step": 21850 + }, + { + "epoch": 93.00418803418803, + "grad_norm": 0.00012972467811778188, + "learning_rate": 7.31244064577398e-07, + "loss": 0.0, + "step": 21860 + }, + { + "epoch": 93.00461538461539, + "grad_norm": 0.0002495836524758488, + "learning_rate": 7.264957264957266e-07, + "loss": 0.0, + "step": 21870 + }, + { + "epoch": 93.00504273504274, + "grad_norm": 0.00011603141319938004, + "learning_rate": 7.217473884140551e-07, + "loss": 0.0, + "step": 21880 + }, + { + "epoch": 93.00547008547008, + "grad_norm": 0.0005587959312833846, + "learning_rate": 7.169990503323837e-07, + "loss": 0.0, + "step": 21890 + }, + { + "epoch": 93.00589743589744, + "grad_norm": 0.00015568920935038477, + "learning_rate": 7.122507122507123e-07, + "loss": 0.0, + "step": 21900 + }, + { + "epoch": 93.00632478632478, + "grad_norm": 0.00015569105744361877, + "learning_rate": 7.075023741690409e-07, + "loss": 0.0, + "step": 21910 + }, + { + "epoch": 93.00675213675214, + "grad_norm": 0.00011201628512935713, + "learning_rate": 7.027540360873694e-07, + "loss": 0.0, + "step": 21920 + }, + { + "epoch": 93.00717948717949, + "grad_norm": 0.0012152871349826455, + "learning_rate": 6.98005698005698e-07, + "loss": 0.0, + "step": 21930 + }, + { + "epoch": 93.00760683760684, + "grad_norm": 0.00011229109804844484, + "learning_rate": 6.932573599240267e-07, + "loss": 0.0, + "step": 21940 + }, + { + "epoch": 93.00803418803419, + "grad_norm": 0.0010557627538219094, + "learning_rate": 6.885090218423553e-07, + "loss": 0.0, + "step": 21950 + }, + { + "epoch": 93.00846153846153, + "grad_norm": 0.0005080616101622581, + "learning_rate": 6.837606837606839e-07, + "loss": 0.0, + "step": 21960 + }, + { + "epoch": 93.00888888888889, + "grad_norm": 0.00011757034371839836, + "learning_rate": 6.790123456790124e-07, + "loss": 0.0001, + "step": 21970 + }, + { + "epoch": 93.00931623931623, + "grad_norm": 0.00048101996071636677, + "learning_rate": 6.74264007597341e-07, + "loss": 0.0, + "step": 21980 + }, + { + "epoch": 93.0097435897436, + "grad_norm": 0.00011343754886183888, + "learning_rate": 6.695156695156696e-07, + "loss": 0.792, + "step": 21990 + }, + { + "epoch": 93.01, + "eval_accuracy": 0.4, + "eval_loss": 6.402031898498535, + "eval_runtime": 34.6678, + "eval_samples_per_second": 0.721, + "eval_steps_per_second": 0.721, + "step": 21996 + }, + { + "epoch": 94.00017094017095, + "grad_norm": 0.00010894170554820448, + "learning_rate": 6.647673314339982e-07, + "loss": 0.0, + "step": 22000 + }, + { + "epoch": 94.00059829059829, + "grad_norm": 0.00013790665252599865, + "learning_rate": 6.600189933523266e-07, + "loss": 0.0, + "step": 22010 + }, + { + "epoch": 94.00102564102563, + "grad_norm": 0.00014016970817465335, + "learning_rate": 6.552706552706554e-07, + "loss": 0.0, + "step": 22020 + }, + { + "epoch": 94.001452991453, + "grad_norm": 0.002228393219411373, + "learning_rate": 6.50522317188984e-07, + "loss": 0.0, + "step": 22030 + }, + { + "epoch": 94.00188034188034, + "grad_norm": 0.00015126651851460338, + "learning_rate": 6.457739791073126e-07, + "loss": 0.0, + "step": 22040 + }, + { + "epoch": 94.0023076923077, + "grad_norm": 0.008348544128239155, + "learning_rate": 6.41025641025641e-07, + "loss": 0.0, + "step": 22050 + }, + { + "epoch": 94.00273504273504, + "grad_norm": 0.00011175816325703636, + "learning_rate": 6.362773029439696e-07, + "loss": 0.0001, + "step": 22060 + }, + { + "epoch": 94.0031623931624, + "grad_norm": 0.00012174518633401021, + "learning_rate": 6.315289648622982e-07, + "loss": 0.0, + "step": 22070 + }, + { + "epoch": 94.00358974358974, + "grad_norm": 0.00026770823751576245, + "learning_rate": 6.267806267806268e-07, + "loss": 0.0, + "step": 22080 + }, + { + "epoch": 94.0040170940171, + "grad_norm": 0.00019762790179811418, + "learning_rate": 6.220322886989554e-07, + "loss": 0.0158, + "step": 22090 + }, + { + "epoch": 94.00444444444445, + "grad_norm": 0.00017857948841992766, + "learning_rate": 6.17283950617284e-07, + "loss": 0.0001, + "step": 22100 + }, + { + "epoch": 94.00487179487179, + "grad_norm": 0.00011318692850181833, + "learning_rate": 6.125356125356126e-07, + "loss": 0.0, + "step": 22110 + }, + { + "epoch": 94.00529914529915, + "grad_norm": 1054.87451171875, + "learning_rate": 6.077872744539412e-07, + "loss": 0.1906, + "step": 22120 + }, + { + "epoch": 94.00572649572649, + "grad_norm": 0.0002740290074143559, + "learning_rate": 6.030389363722698e-07, + "loss": 0.0, + "step": 22130 + }, + { + "epoch": 94.00615384615385, + "grad_norm": 0.00017096682859119028, + "learning_rate": 5.982905982905984e-07, + "loss": 0.0, + "step": 22140 + }, + { + "epoch": 94.0065811965812, + "grad_norm": 0.00021798326633870602, + "learning_rate": 5.935422602089269e-07, + "loss": 0.0, + "step": 22150 + }, + { + "epoch": 94.00700854700855, + "grad_norm": 0.00105681037530303, + "learning_rate": 5.887939221272555e-07, + "loss": 0.0, + "step": 22160 + }, + { + "epoch": 94.0074358974359, + "grad_norm": 0.0005219983286224306, + "learning_rate": 5.84045584045584e-07, + "loss": 0.0, + "step": 22170 + }, + { + "epoch": 94.00786324786324, + "grad_norm": 0.00013317124103195965, + "learning_rate": 5.792972459639127e-07, + "loss": 0.0, + "step": 22180 + }, + { + "epoch": 94.0082905982906, + "grad_norm": 0.00028317165561020374, + "learning_rate": 5.745489078822413e-07, + "loss": 0.0, + "step": 22190 + }, + { + "epoch": 94.00871794871794, + "grad_norm": 0.00024831280461512506, + "learning_rate": 5.698005698005699e-07, + "loss": 0.0, + "step": 22200 + }, + { + "epoch": 94.0091452991453, + "grad_norm": 24.371335983276367, + "learning_rate": 5.650522317188984e-07, + "loss": 0.0029, + "step": 22210 + }, + { + "epoch": 94.00957264957265, + "grad_norm": 0.0005085449665784836, + "learning_rate": 5.60303893637227e-07, + "loss": 0.0, + "step": 22220 + }, + { + "epoch": 94.01, + "grad_norm": 0.00015020875434856862, + "learning_rate": 5.555555555555555e-07, + "loss": 0.0, + "step": 22230 + }, + { + "epoch": 94.01, + "eval_accuracy": 0.4, + "eval_loss": 6.445286273956299, + "eval_runtime": 36.5582, + "eval_samples_per_second": 0.684, + "eval_steps_per_second": 0.684, + "step": 22230 + }, + { + "epoch": 95.00042735042734, + "grad_norm": 0.00011259275925112888, + "learning_rate": 5.508072174738842e-07, + "loss": 0.0, + "step": 22240 + }, + { + "epoch": 95.0008547008547, + "grad_norm": 0.00021879606356378645, + "learning_rate": 5.460588793922129e-07, + "loss": 0.0, + "step": 22250 + }, + { + "epoch": 95.00128205128205, + "grad_norm": 0.00042616037535481155, + "learning_rate": 5.413105413105414e-07, + "loss": 0.0, + "step": 22260 + }, + { + "epoch": 95.0017094017094, + "grad_norm": 0.00019269147014711052, + "learning_rate": 5.3656220322887e-07, + "loss": 0.0, + "step": 22270 + }, + { + "epoch": 95.00213675213675, + "grad_norm": 0.00035289855441078544, + "learning_rate": 5.318138651471985e-07, + "loss": 0.0, + "step": 22280 + }, + { + "epoch": 95.00256410256411, + "grad_norm": 0.00012624288501683623, + "learning_rate": 5.270655270655271e-07, + "loss": 0.0, + "step": 22290 + }, + { + "epoch": 95.00299145299145, + "grad_norm": 0.00018576315778773278, + "learning_rate": 5.223171889838557e-07, + "loss": 0.0, + "step": 22300 + }, + { + "epoch": 95.00341880341881, + "grad_norm": 0.000108227992313914, + "learning_rate": 5.175688509021843e-07, + "loss": 0.0002, + "step": 22310 + }, + { + "epoch": 95.00384615384615, + "grad_norm": 0.0001419824839103967, + "learning_rate": 5.128205128205128e-07, + "loss": 0.0, + "step": 22320 + }, + { + "epoch": 95.0042735042735, + "grad_norm": 0.0004882781649939716, + "learning_rate": 5.080721747388414e-07, + "loss": 0.0, + "step": 22330 + }, + { + "epoch": 95.00470085470086, + "grad_norm": 0.0001368028315482661, + "learning_rate": 5.0332383665717e-07, + "loss": 0.0, + "step": 22340 + }, + { + "epoch": 95.0051282051282, + "grad_norm": 0.0009662008378654718, + "learning_rate": 4.985754985754987e-07, + "loss": 0.0, + "step": 22350 + }, + { + "epoch": 95.00555555555556, + "grad_norm": 0.0002917400561273098, + "learning_rate": 4.938271604938272e-07, + "loss": 0.0, + "step": 22360 + }, + { + "epoch": 95.0059829059829, + "grad_norm": 0.00049038470024243, + "learning_rate": 4.890788224121558e-07, + "loss": 0.0, + "step": 22370 + }, + { + "epoch": 95.00641025641026, + "grad_norm": 0.00018329080194234848, + "learning_rate": 4.843304843304843e-07, + "loss": 0.0, + "step": 22380 + }, + { + "epoch": 95.0068376068376, + "grad_norm": 0.00012125585635658354, + "learning_rate": 4.795821462488129e-07, + "loss": 0.0, + "step": 22390 + }, + { + "epoch": 95.00726495726495, + "grad_norm": 0.00016512990987394005, + "learning_rate": 4.748338081671416e-07, + "loss": 0.0, + "step": 22400 + }, + { + "epoch": 95.00769230769231, + "grad_norm": 0.0014239164302125573, + "learning_rate": 4.700854700854701e-07, + "loss": 0.0, + "step": 22410 + }, + { + "epoch": 95.00811965811965, + "grad_norm": 0.011379273608326912, + "learning_rate": 4.653371320037987e-07, + "loss": 0.0, + "step": 22420 + }, + { + "epoch": 95.00854700854701, + "grad_norm": 0.0001255527458852157, + "learning_rate": 4.605887939221273e-07, + "loss": 0.0, + "step": 22430 + }, + { + "epoch": 95.00897435897436, + "grad_norm": 0.00010715150710893795, + "learning_rate": 4.558404558404559e-07, + "loss": 0.0, + "step": 22440 + }, + { + "epoch": 95.00940170940171, + "grad_norm": 0.0006061216117814183, + "learning_rate": 4.5109211775878445e-07, + "loss": 0.0009, + "step": 22450 + }, + { + "epoch": 95.00982905982906, + "grad_norm": 0.0009303994593210518, + "learning_rate": 4.4634377967711306e-07, + "loss": 0.0, + "step": 22460 + }, + { + "epoch": 95.01, + "eval_accuracy": 0.4, + "eval_loss": 6.910160064697266, + "eval_runtime": 36.7216, + "eval_samples_per_second": 0.681, + "eval_steps_per_second": 0.681, + "step": 22464 + }, + { + "epoch": 96.00025641025641, + "grad_norm": 0.00010953476885333657, + "learning_rate": 4.415954415954416e-07, + "loss": 0.0, + "step": 22470 + }, + { + "epoch": 96.00068376068376, + "grad_norm": 0.0005640748422592878, + "learning_rate": 4.3684710351377024e-07, + "loss": 0.0, + "step": 22480 + }, + { + "epoch": 96.00111111111111, + "grad_norm": 0.00019859550229739398, + "learning_rate": 4.320987654320988e-07, + "loss": 0.0, + "step": 22490 + }, + { + "epoch": 96.00153846153846, + "grad_norm": 0.00011656359856715426, + "learning_rate": 4.273504273504274e-07, + "loss": 0.0, + "step": 22500 + }, + { + "epoch": 96.00196581196582, + "grad_norm": 0.010922577232122421, + "learning_rate": 4.226020892687559e-07, + "loss": 0.0, + "step": 22510 + }, + { + "epoch": 96.00239316239316, + "grad_norm": 0.0017669210210442543, + "learning_rate": 4.1785375118708454e-07, + "loss": 0.0, + "step": 22520 + }, + { + "epoch": 96.0028205128205, + "grad_norm": 9.618465264793485e-05, + "learning_rate": 4.131054131054131e-07, + "loss": 0.0, + "step": 22530 + }, + { + "epoch": 96.00324786324786, + "grad_norm": 0.00018471429939381778, + "learning_rate": 4.083570750237417e-07, + "loss": 0.0, + "step": 22540 + }, + { + "epoch": 96.00367521367521, + "grad_norm": 0.00048411861644126475, + "learning_rate": 4.0360873694207033e-07, + "loss": 0.0, + "step": 22550 + }, + { + "epoch": 96.00410256410257, + "grad_norm": 0.00014513205678667873, + "learning_rate": 3.988603988603989e-07, + "loss": 0.0, + "step": 22560 + }, + { + "epoch": 96.00452991452991, + "grad_norm": 0.00017483255942352116, + "learning_rate": 3.941120607787275e-07, + "loss": 0.0, + "step": 22570 + }, + { + "epoch": 96.00495726495727, + "grad_norm": 0.0007405714131891727, + "learning_rate": 3.8936372269705607e-07, + "loss": 0.0, + "step": 22580 + }, + { + "epoch": 96.00538461538461, + "grad_norm": 0.00010351594391977414, + "learning_rate": 3.846153846153847e-07, + "loss": 0.0, + "step": 22590 + }, + { + "epoch": 96.00581196581197, + "grad_norm": 0.00017762374773155898, + "learning_rate": 3.7986704653371324e-07, + "loss": 0.0, + "step": 22600 + }, + { + "epoch": 96.00623931623932, + "grad_norm": 0.0008599003194831312, + "learning_rate": 3.7511870845204186e-07, + "loss": 0.0, + "step": 22610 + }, + { + "epoch": 96.00666666666666, + "grad_norm": 9.716143540572375e-05, + "learning_rate": 3.7037037037037036e-07, + "loss": 0.0, + "step": 22620 + }, + { + "epoch": 96.00709401709402, + "grad_norm": 0.0006491367239505053, + "learning_rate": 3.65622032288699e-07, + "loss": 0.0001, + "step": 22630 + }, + { + "epoch": 96.00752136752136, + "grad_norm": 0.0002162454038625583, + "learning_rate": 3.6087369420702754e-07, + "loss": 0.0, + "step": 22640 + }, + { + "epoch": 96.00794871794872, + "grad_norm": 0.0001346368808299303, + "learning_rate": 3.5612535612535615e-07, + "loss": 0.0, + "step": 22650 + }, + { + "epoch": 96.00837606837607, + "grad_norm": 0.000659442157484591, + "learning_rate": 3.513770180436847e-07, + "loss": 0.0, + "step": 22660 + }, + { + "epoch": 96.00880341880342, + "grad_norm": 0.00012783669808413833, + "learning_rate": 3.4662867996201333e-07, + "loss": 0.0, + "step": 22670 + }, + { + "epoch": 96.00923076923077, + "grad_norm": 0.00012355089711491019, + "learning_rate": 3.4188034188034194e-07, + "loss": 0.0, + "step": 22680 + }, + { + "epoch": 96.00965811965811, + "grad_norm": 0.00010718983685364947, + "learning_rate": 3.371320037986705e-07, + "loss": 0.0, + "step": 22690 + }, + { + "epoch": 96.01, + "eval_accuracy": 0.4, + "eval_loss": 6.9261651039123535, + "eval_runtime": 37.5649, + "eval_samples_per_second": 0.666, + "eval_steps_per_second": 0.666, + "step": 22698 + }, + { + "epoch": 97.00008547008547, + "grad_norm": 0.00010189504973823205, + "learning_rate": 3.323836657169991e-07, + "loss": 0.0, + "step": 22700 + }, + { + "epoch": 97.00051282051282, + "grad_norm": 0.01328178122639656, + "learning_rate": 3.276353276353277e-07, + "loss": 0.4979, + "step": 22710 + }, + { + "epoch": 97.00094017094017, + "grad_norm": 0.00010843879863386974, + "learning_rate": 3.228869895536563e-07, + "loss": 0.0, + "step": 22720 + }, + { + "epoch": 97.00136752136753, + "grad_norm": 0.0005735277663916349, + "learning_rate": 3.181386514719848e-07, + "loss": 0.0, + "step": 22730 + }, + { + "epoch": 97.00179487179487, + "grad_norm": 0.0010003969073295593, + "learning_rate": 3.133903133903134e-07, + "loss": 0.0, + "step": 22740 + }, + { + "epoch": 97.00222222222222, + "grad_norm": 0.0009289424051530659, + "learning_rate": 3.08641975308642e-07, + "loss": 0.0, + "step": 22750 + }, + { + "epoch": 97.00264957264957, + "grad_norm": 0.0013247294118627906, + "learning_rate": 3.038936372269706e-07, + "loss": 0.0, + "step": 22760 + }, + { + "epoch": 97.00307692307692, + "grad_norm": 0.00010199108510278165, + "learning_rate": 2.991452991452992e-07, + "loss": 0.0, + "step": 22770 + }, + { + "epoch": 97.00350427350428, + "grad_norm": 0.0014735024888068438, + "learning_rate": 2.9439696106362777e-07, + "loss": 0.0, + "step": 22780 + }, + { + "epoch": 97.00393162393162, + "grad_norm": 0.00014969809853937477, + "learning_rate": 2.8964862298195633e-07, + "loss": 0.0, + "step": 22790 + }, + { + "epoch": 97.00435897435898, + "grad_norm": 0.0001152329205069691, + "learning_rate": 2.8490028490028494e-07, + "loss": 0.0, + "step": 22800 + }, + { + "epoch": 97.00478632478632, + "grad_norm": 0.0011336614843457937, + "learning_rate": 2.801519468186135e-07, + "loss": 0.0, + "step": 22810 + }, + { + "epoch": 97.00521367521368, + "grad_norm": 0.0002444600104354322, + "learning_rate": 2.754036087369421e-07, + "loss": 0.0, + "step": 22820 + }, + { + "epoch": 97.00564102564103, + "grad_norm": 0.0011971363564953208, + "learning_rate": 2.706552706552707e-07, + "loss": 0.0, + "step": 22830 + }, + { + "epoch": 97.00606837606837, + "grad_norm": 0.000934019626583904, + "learning_rate": 2.6590693257359924e-07, + "loss": 0.0, + "step": 22840 + }, + { + "epoch": 97.00649572649573, + "grad_norm": 0.00019381985475774854, + "learning_rate": 2.6115859449192786e-07, + "loss": 0.0, + "step": 22850 + }, + { + "epoch": 97.00692307692307, + "grad_norm": 0.00011170632933499292, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0, + "step": 22860 + }, + { + "epoch": 97.00735042735043, + "grad_norm": 0.00010790720989461988, + "learning_rate": 2.51661918328585e-07, + "loss": 0.0, + "step": 22870 + }, + { + "epoch": 97.00777777777778, + "grad_norm": 0.0001576395152369514, + "learning_rate": 2.469135802469136e-07, + "loss": 0.0, + "step": 22880 + }, + { + "epoch": 97.00820512820513, + "grad_norm": 0.00010023313370766118, + "learning_rate": 2.4216524216524215e-07, + "loss": 0.0, + "step": 22890 + }, + { + "epoch": 97.00863247863248, + "grad_norm": 0.0016645913710817695, + "learning_rate": 2.374169040835708e-07, + "loss": 0.0, + "step": 22900 + }, + { + "epoch": 97.00905982905982, + "grad_norm": 0.0006122203776612878, + "learning_rate": 2.3266856600189936e-07, + "loss": 0.0, + "step": 22910 + }, + { + "epoch": 97.00948717948718, + "grad_norm": 0.0002639706071931869, + "learning_rate": 2.2792022792022794e-07, + "loss": 0.0, + "step": 22920 + }, + { + "epoch": 97.00991452991452, + "grad_norm": 0.00010460082557983696, + "learning_rate": 2.2317188983855653e-07, + "loss": 0.0, + "step": 22930 + }, + { + "epoch": 97.01, + "eval_accuracy": 0.4, + "eval_loss": 6.775699615478516, + "eval_runtime": 36.4004, + "eval_samples_per_second": 0.687, + "eval_steps_per_second": 0.687, + "step": 22932 + }, + { + "epoch": 98.00034188034188, + "grad_norm": 0.0007676715613342822, + "learning_rate": 2.1842355175688512e-07, + "loss": 0.0, + "step": 22940 + }, + { + "epoch": 98.00076923076924, + "grad_norm": 0.00042605900671333075, + "learning_rate": 2.136752136752137e-07, + "loss": 0.0, + "step": 22950 + }, + { + "epoch": 98.00119658119658, + "grad_norm": 0.0001429203402949497, + "learning_rate": 2.0892687559354227e-07, + "loss": 0.0, + "step": 22960 + }, + { + "epoch": 98.00162393162393, + "grad_norm": 0.0004025712551083416, + "learning_rate": 2.0417853751187086e-07, + "loss": 0.0, + "step": 22970 + }, + { + "epoch": 98.00205128205128, + "grad_norm": 0.001206466811709106, + "learning_rate": 1.9943019943019944e-07, + "loss": 0.0, + "step": 22980 + }, + { + "epoch": 98.00247863247863, + "grad_norm": 0.00012541662727016956, + "learning_rate": 1.9468186134852803e-07, + "loss": 0.3638, + "step": 22990 + }, + { + "epoch": 98.00290598290599, + "grad_norm": 0.0002750447019934654, + "learning_rate": 1.8993352326685662e-07, + "loss": 0.0, + "step": 23000 + }, + { + "epoch": 98.00333333333333, + "grad_norm": 0.00010290797217749059, + "learning_rate": 1.8518518518518518e-07, + "loss": 0.0, + "step": 23010 + }, + { + "epoch": 98.00376068376069, + "grad_norm": 0.0001165928115369752, + "learning_rate": 1.8043684710351377e-07, + "loss": 0.0, + "step": 23020 + }, + { + "epoch": 98.00418803418803, + "grad_norm": 0.0001025166129693389, + "learning_rate": 1.7568850902184236e-07, + "loss": 0.0, + "step": 23030 + }, + { + "epoch": 98.00461538461539, + "grad_norm": 9.691870218375698e-05, + "learning_rate": 1.7094017094017097e-07, + "loss": 0.0, + "step": 23040 + }, + { + "epoch": 98.00504273504274, + "grad_norm": 0.00016511735157109797, + "learning_rate": 1.6619183285849956e-07, + "loss": 0.0, + "step": 23050 + }, + { + "epoch": 98.00547008547008, + "grad_norm": 0.00010559390648268163, + "learning_rate": 1.6144349477682815e-07, + "loss": 0.0003, + "step": 23060 + }, + { + "epoch": 98.00589743589744, + "grad_norm": 0.0008746449020691216, + "learning_rate": 1.566951566951567e-07, + "loss": 0.0, + "step": 23070 + }, + { + "epoch": 98.00632478632478, + "grad_norm": 0.0002250542602268979, + "learning_rate": 1.519468186134853e-07, + "loss": 0.0, + "step": 23080 + }, + { + "epoch": 98.00675213675214, + "grad_norm": 0.00017459446098655462, + "learning_rate": 1.4719848053181388e-07, + "loss": 0.0, + "step": 23090 + }, + { + "epoch": 98.00717948717949, + "grad_norm": 0.00018693608581088483, + "learning_rate": 1.4245014245014247e-07, + "loss": 0.0, + "step": 23100 + }, + { + "epoch": 98.00760683760684, + "grad_norm": 0.0016856353031471372, + "learning_rate": 1.3770180436847106e-07, + "loss": 0.0, + "step": 23110 + }, + { + "epoch": 98.00803418803419, + "grad_norm": 0.00015905691543594003, + "learning_rate": 1.3295346628679962e-07, + "loss": 0.0, + "step": 23120 + }, + { + "epoch": 98.00846153846153, + "grad_norm": 0.00016817098367027938, + "learning_rate": 1.282051282051282e-07, + "loss": 0.0, + "step": 23130 + }, + { + "epoch": 98.00888888888889, + "grad_norm": 0.00019543507369235158, + "learning_rate": 1.234567901234568e-07, + "loss": 0.0, + "step": 23140 + }, + { + "epoch": 98.00931623931623, + "grad_norm": 0.0006627991679124534, + "learning_rate": 1.187084520417854e-07, + "loss": 0.0, + "step": 23150 + }, + { + "epoch": 98.0097435897436, + "grad_norm": 0.00031626957934349775, + "learning_rate": 1.1396011396011397e-07, + "loss": 0.0, + "step": 23160 + }, + { + "epoch": 98.01, + "eval_accuracy": 0.4, + "eval_loss": 6.829818248748779, + "eval_runtime": 35.756, + "eval_samples_per_second": 0.699, + "eval_steps_per_second": 0.699, + "step": 23166 + }, + { + "epoch": 99.00017094017095, + "grad_norm": 0.00048812164459377527, + "learning_rate": 1.0921177587844256e-07, + "loss": 0.0, + "step": 23170 + }, + { + "epoch": 99.00059829059829, + "grad_norm": 0.0003830118221230805, + "learning_rate": 1.0446343779677113e-07, + "loss": 0.0, + "step": 23180 + }, + { + "epoch": 99.00102564102563, + "grad_norm": 0.00040612902375869453, + "learning_rate": 9.971509971509972e-08, + "loss": 0.0, + "step": 23190 + }, + { + "epoch": 99.001452991453, + "grad_norm": 0.00012840931594837457, + "learning_rate": 9.496676163342831e-08, + "loss": 0.0, + "step": 23200 + }, + { + "epoch": 99.00188034188034, + "grad_norm": 0.0002664999628905207, + "learning_rate": 9.021842355175688e-08, + "loss": 0.0, + "step": 23210 + }, + { + "epoch": 99.0023076923077, + "grad_norm": 0.00010337825369788334, + "learning_rate": 8.547008547008549e-08, + "loss": 0.0, + "step": 23220 + }, + { + "epoch": 99.00273504273504, + "grad_norm": 0.0008002725662663579, + "learning_rate": 8.072174738841407e-08, + "loss": 0.0, + "step": 23230 + }, + { + "epoch": 99.0031623931624, + "grad_norm": 0.00012994400458410382, + "learning_rate": 7.597340930674265e-08, + "loss": 0.0, + "step": 23240 + }, + { + "epoch": 99.00358974358974, + "grad_norm": 0.0011528537143021822, + "learning_rate": 7.122507122507124e-08, + "loss": 0.0, + "step": 23250 + }, + { + "epoch": 99.0040170940171, + "grad_norm": 0.00013322003360372037, + "learning_rate": 6.647673314339981e-08, + "loss": 0.0, + "step": 23260 + }, + { + "epoch": 99.00444444444445, + "grad_norm": 0.001754231285303831, + "learning_rate": 6.17283950617284e-08, + "loss": 0.0, + "step": 23270 + }, + { + "epoch": 99.00487179487179, + "grad_norm": 0.0002544873859733343, + "learning_rate": 5.6980056980056986e-08, + "loss": 0.0, + "step": 23280 + }, + { + "epoch": 99.00529914529915, + "grad_norm": 0.0005114562809467316, + "learning_rate": 5.223171889838557e-08, + "loss": 0.0, + "step": 23290 + }, + { + "epoch": 99.00572649572649, + "grad_norm": 0.00014945660950616002, + "learning_rate": 4.7483380816714155e-08, + "loss": 0.0, + "step": 23300 + }, + { + "epoch": 99.00615384615385, + "grad_norm": 0.0005400913069024682, + "learning_rate": 4.273504273504274e-08, + "loss": 0.0, + "step": 23310 + }, + { + "epoch": 99.0065811965812, + "grad_norm": 0.0001350509119220078, + "learning_rate": 3.7986704653371324e-08, + "loss": 0.0, + "step": 23320 + }, + { + "epoch": 99.00700854700855, + "grad_norm": 0.00013444873911794275, + "learning_rate": 3.3238366571699905e-08, + "loss": 0.0, + "step": 23330 + }, + { + "epoch": 99.0074358974359, + "grad_norm": 0.0003640491340775043, + "learning_rate": 2.8490028490028493e-08, + "loss": 0.0, + "step": 23340 + }, + { + "epoch": 99.00786324786324, + "grad_norm": 0.0004577758372761309, + "learning_rate": 2.3741690408357078e-08, + "loss": 0.0, + "step": 23350 + }, + { + "epoch": 99.0082905982906, + "grad_norm": 0.0005503903958015144, + "learning_rate": 1.8993352326685662e-08, + "loss": 0.0, + "step": 23360 + }, + { + "epoch": 99.00871794871794, + "grad_norm": 0.0008972916402854025, + "learning_rate": 1.4245014245014247e-08, + "loss": 0.0, + "step": 23370 + }, + { + "epoch": 99.0091452991453, + "grad_norm": 0.00013151808525435627, + "learning_rate": 9.496676163342831e-09, + "loss": 0.0, + "step": 23380 + }, + { + "epoch": 99.00957264957265, + "grad_norm": 0.00012301822425797582, + "learning_rate": 4.7483380816714155e-09, + "loss": 0.0, + "step": 23390 + }, + { + "epoch": 99.01, + "grad_norm": 0.002225738950073719, + "learning_rate": 0.0, + "loss": 0.0, + "step": 23400 + }, + { + "epoch": 99.01, + "eval_accuracy": 0.4, + "eval_loss": 6.831691741943359, + "eval_runtime": 36.0994, + "eval_samples_per_second": 0.693, + "eval_steps_per_second": 0.693, + "step": 23400 + }, + { + "epoch": 99.01, + "step": 23400, + "total_flos": 1.0275070897744773e+20, + "train_loss": 0.44932733817713766, + "train_runtime": 74953.482, + "train_samples_per_second": 0.312, + "train_steps_per_second": 0.312 + }, + { + "epoch": 99.01, + "eval_accuracy": 0.48, + "eval_loss": 4.4508891105651855, + "eval_runtime": 32.6922, + "eval_samples_per_second": 0.765, + "eval_steps_per_second": 0.765, + "step": 23400 + }, + { + "epoch": 99.01, + "eval_accuracy": 0.48, + "eval_loss": 4.450888633728027, + "eval_runtime": 32.6242, + "eval_samples_per_second": 0.766, + "eval_steps_per_second": 0.766, + "step": 23400 + } + ], + "logging_steps": 10, + "max_steps": 23400, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0275070897744773e+20, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}