diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,22223 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.997639653815893, + "eval_steps": 500, + "global_step": 3170, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003147128245476003, + "grad_norm": 2.0566761052521856, + "learning_rate": 4.1666666666666667e-07, + "loss": 1.0248, + "step": 1 + }, + { + "epoch": 0.006294256490952006, + "grad_norm": 2.078116756187581, + "learning_rate": 8.333333333333333e-07, + "loss": 1.0183, + "step": 2 + }, + { + "epoch": 0.00944138473642801, + "grad_norm": 2.055186894692077, + "learning_rate": 1.25e-06, + "loss": 1.0165, + "step": 3 + }, + { + "epoch": 0.012588512981904013, + "grad_norm": 2.0254172121373073, + "learning_rate": 1.6666666666666667e-06, + "loss": 1.0132, + "step": 4 + }, + { + "epoch": 0.015735641227380016, + "grad_norm": 1.979378254901161, + "learning_rate": 2.0833333333333334e-06, + "loss": 1.0338, + "step": 5 + }, + { + "epoch": 0.01888276947285602, + "grad_norm": 1.6978990047138407, + "learning_rate": 2.5e-06, + "loss": 1.0147, + "step": 6 + }, + { + "epoch": 0.022029897718332022, + "grad_norm": 1.5934719348056317, + "learning_rate": 2.916666666666667e-06, + "loss": 1.0025, + "step": 7 + }, + { + "epoch": 0.025177025963808025, + "grad_norm": 1.1983077219680367, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.9763, + "step": 8 + }, + { + "epoch": 0.02832415420928403, + "grad_norm": 1.0943483464567008, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.9673, + "step": 9 + }, + { + "epoch": 0.03147128245476003, + "grad_norm": 1.0795799352113267, + "learning_rate": 4.166666666666667e-06, + "loss": 0.9625, + "step": 10 + }, + { + "epoch": 0.03461841070023604, + "grad_norm": 1.2586895987651956, + "learning_rate": 4.583333333333333e-06, + "loss": 0.9443, + "step": 11 + }, + { + "epoch": 0.03776553894571204, + "grad_norm": 1.2360462710902367, + "learning_rate": 5e-06, + "loss": 0.952, + "step": 12 + }, + { + "epoch": 0.040912667191188044, + "grad_norm": 1.158712657793634, + "learning_rate": 5.416666666666667e-06, + "loss": 0.9186, + "step": 13 + }, + { + "epoch": 0.044059795436664044, + "grad_norm": 0.9342993351221153, + "learning_rate": 5.833333333333334e-06, + "loss": 0.8874, + "step": 14 + }, + { + "epoch": 0.04720692368214005, + "grad_norm": 1.0504874222027794, + "learning_rate": 6.25e-06, + "loss": 0.878, + "step": 15 + }, + { + "epoch": 0.05035405192761605, + "grad_norm": 0.8975104331120672, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8668, + "step": 16 + }, + { + "epoch": 0.05350118017309206, + "grad_norm": 0.6476430875482199, + "learning_rate": 7.083333333333335e-06, + "loss": 0.8655, + "step": 17 + }, + { + "epoch": 0.05664830841856806, + "grad_norm": 0.49682103011953394, + "learning_rate": 7.500000000000001e-06, + "loss": 0.8502, + "step": 18 + }, + { + "epoch": 0.05979543666404406, + "grad_norm": 0.5685849690063021, + "learning_rate": 7.916666666666667e-06, + "loss": 0.8249, + "step": 19 + }, + { + "epoch": 0.06294256490952006, + "grad_norm": 0.7286039018171099, + "learning_rate": 8.333333333333334e-06, + "loss": 0.8183, + "step": 20 + }, + { + "epoch": 0.06608969315499606, + "grad_norm": 0.650325267587393, + "learning_rate": 8.750000000000001e-06, + "loss": 0.8078, + "step": 21 + }, + { + "epoch": 0.06923682140047208, + "grad_norm": 0.524249002042332, + "learning_rate": 9.166666666666666e-06, + "loss": 0.7968, + "step": 22 + }, + { + "epoch": 0.07238394964594808, + "grad_norm": 0.42266135038030506, + "learning_rate": 9.583333333333335e-06, + "loss": 0.793, + "step": 23 + }, + { + "epoch": 0.07553107789142408, + "grad_norm": 0.45652357144630545, + "learning_rate": 1e-05, + "loss": 0.786, + "step": 24 + }, + { + "epoch": 0.07867820613690008, + "grad_norm": 0.48851305915388266, + "learning_rate": 1.0416666666666668e-05, + "loss": 0.7842, + "step": 25 + }, + { + "epoch": 0.08182533438237609, + "grad_norm": 0.47219631692611636, + "learning_rate": 1.0833333333333334e-05, + "loss": 0.7726, + "step": 26 + }, + { + "epoch": 0.08497246262785209, + "grad_norm": 0.39201363065054773, + "learning_rate": 1.125e-05, + "loss": 0.768, + "step": 27 + }, + { + "epoch": 0.08811959087332809, + "grad_norm": 0.3342565416438565, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.7589, + "step": 28 + }, + { + "epoch": 0.09126671911880409, + "grad_norm": 0.35827703185804977, + "learning_rate": 1.2083333333333333e-05, + "loss": 0.7723, + "step": 29 + }, + { + "epoch": 0.0944138473642801, + "grad_norm": 0.3625916446194259, + "learning_rate": 1.25e-05, + "loss": 0.7524, + "step": 30 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 0.32147227722705174, + "learning_rate": 1.2916666666666668e-05, + "loss": 0.7462, + "step": 31 + }, + { + "epoch": 0.1007081038552321, + "grad_norm": 0.3105919347762339, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.7432, + "step": 32 + }, + { + "epoch": 0.1038552321007081, + "grad_norm": 0.2941313278165609, + "learning_rate": 1.375e-05, + "loss": 0.7487, + "step": 33 + }, + { + "epoch": 0.10700236034618411, + "grad_norm": 0.2847875994844311, + "learning_rate": 1.416666666666667e-05, + "loss": 0.7279, + "step": 34 + }, + { + "epoch": 0.11014948859166011, + "grad_norm": 0.29110050664950804, + "learning_rate": 1.4583333333333333e-05, + "loss": 0.7264, + "step": 35 + }, + { + "epoch": 0.11329661683713611, + "grad_norm": 0.2758326744258242, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.7289, + "step": 36 + }, + { + "epoch": 0.11644374508261211, + "grad_norm": 0.25172506909717546, + "learning_rate": 1.5416666666666668e-05, + "loss": 0.7233, + "step": 37 + }, + { + "epoch": 0.11959087332808813, + "grad_norm": 0.2472744394585722, + "learning_rate": 1.5833333333333333e-05, + "loss": 0.729, + "step": 38 + }, + { + "epoch": 0.12273800157356413, + "grad_norm": 0.2646648296393675, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.7279, + "step": 39 + }, + { + "epoch": 0.12588512981904013, + "grad_norm": 0.24358773217689053, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.7184, + "step": 40 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.2393241155630341, + "learning_rate": 1.7083333333333333e-05, + "loss": 0.7136, + "step": 41 + }, + { + "epoch": 0.13217938630999213, + "grad_norm": 0.2330003474602153, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.711, + "step": 42 + }, + { + "epoch": 0.13532651455546812, + "grad_norm": 0.22294095752365714, + "learning_rate": 1.7916666666666667e-05, + "loss": 0.7126, + "step": 43 + }, + { + "epoch": 0.13847364280094415, + "grad_norm": 0.23816885540234745, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.7188, + "step": 44 + }, + { + "epoch": 0.14162077104642015, + "grad_norm": 0.2257889298086421, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.6991, + "step": 45 + }, + { + "epoch": 0.14476789929189615, + "grad_norm": 0.20099324635222396, + "learning_rate": 1.916666666666667e-05, + "loss": 0.7006, + "step": 46 + }, + { + "epoch": 0.14791502753737215, + "grad_norm": 0.25186139333784574, + "learning_rate": 1.9583333333333333e-05, + "loss": 0.7087, + "step": 47 + }, + { + "epoch": 0.15106215578284815, + "grad_norm": 0.2232374205375328, + "learning_rate": 2e-05, + "loss": 0.6971, + "step": 48 + }, + { + "epoch": 0.15420928402832415, + "grad_norm": 0.21825531385293007, + "learning_rate": 2.0416666666666667e-05, + "loss": 0.697, + "step": 49 + }, + { + "epoch": 0.15735641227380015, + "grad_norm": 0.21596204587349424, + "learning_rate": 2.0833333333333336e-05, + "loss": 0.6887, + "step": 50 + }, + { + "epoch": 0.16050354051927615, + "grad_norm": 0.23116942027734438, + "learning_rate": 2.125e-05, + "loss": 0.6885, + "step": 51 + }, + { + "epoch": 0.16365066876475218, + "grad_norm": 0.21015812381257615, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.6866, + "step": 52 + }, + { + "epoch": 0.16679779701022818, + "grad_norm": 0.19996909500963955, + "learning_rate": 2.2083333333333336e-05, + "loss": 0.6898, + "step": 53 + }, + { + "epoch": 0.16994492525570418, + "grad_norm": 0.20997251324092625, + "learning_rate": 2.25e-05, + "loss": 0.6836, + "step": 54 + }, + { + "epoch": 0.17309205350118018, + "grad_norm": 0.20108945450898513, + "learning_rate": 2.2916666666666667e-05, + "loss": 0.6868, + "step": 55 + }, + { + "epoch": 0.17623918174665618, + "grad_norm": 0.2035143838254788, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.6936, + "step": 56 + }, + { + "epoch": 0.17938630999213218, + "grad_norm": 0.2004298904967849, + "learning_rate": 2.375e-05, + "loss": 0.6746, + "step": 57 + }, + { + "epoch": 0.18253343823760818, + "grad_norm": 0.20059328010088198, + "learning_rate": 2.4166666666666667e-05, + "loss": 0.682, + "step": 58 + }, + { + "epoch": 0.18568056648308418, + "grad_norm": 0.21755269002083433, + "learning_rate": 2.4583333333333336e-05, + "loss": 0.6735, + "step": 59 + }, + { + "epoch": 0.1888276947285602, + "grad_norm": 0.2129373116359228, + "learning_rate": 2.5e-05, + "loss": 0.6663, + "step": 60 + }, + { + "epoch": 0.1919748229740362, + "grad_norm": 0.1995735536259152, + "learning_rate": 2.5416666666666667e-05, + "loss": 0.6787, + "step": 61 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 0.23037748881825523, + "learning_rate": 2.5833333333333336e-05, + "loss": 0.6703, + "step": 62 + }, + { + "epoch": 0.1982690794649882, + "grad_norm": 0.18391751461207972, + "learning_rate": 2.625e-05, + "loss": 0.6764, + "step": 63 + }, + { + "epoch": 0.2014162077104642, + "grad_norm": 0.2123421226257098, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.6714, + "step": 64 + }, + { + "epoch": 0.2045633359559402, + "grad_norm": 0.20183153602864587, + "learning_rate": 2.7083333333333335e-05, + "loss": 0.6656, + "step": 65 + }, + { + "epoch": 0.2077104642014162, + "grad_norm": 0.19119357792446254, + "learning_rate": 2.75e-05, + "loss": 0.6684, + "step": 66 + }, + { + "epoch": 0.2108575924468922, + "grad_norm": 0.20177148219300692, + "learning_rate": 2.7916666666666666e-05, + "loss": 0.6458, + "step": 67 + }, + { + "epoch": 0.21400472069236823, + "grad_norm": 0.22326018847799878, + "learning_rate": 2.833333333333334e-05, + "loss": 0.6659, + "step": 68 + }, + { + "epoch": 0.21715184893784423, + "grad_norm": 0.22960589662619602, + "learning_rate": 2.875e-05, + "loss": 0.6814, + "step": 69 + }, + { + "epoch": 0.22029897718332023, + "grad_norm": 0.20556408160244669, + "learning_rate": 2.9166666666666666e-05, + "loss": 0.6651, + "step": 70 + }, + { + "epoch": 0.22344610542879623, + "grad_norm": 0.23091408485344644, + "learning_rate": 2.958333333333334e-05, + "loss": 0.6587, + "step": 71 + }, + { + "epoch": 0.22659323367427223, + "grad_norm": 0.24593395754345967, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.6559, + "step": 72 + }, + { + "epoch": 0.22974036191974823, + "grad_norm": 0.2941140735112936, + "learning_rate": 3.0416666666666666e-05, + "loss": 0.6523, + "step": 73 + }, + { + "epoch": 0.23288749016522423, + "grad_norm": 0.2726093572840182, + "learning_rate": 3.0833333333333335e-05, + "loss": 0.6685, + "step": 74 + }, + { + "epoch": 0.23603461841070023, + "grad_norm": 0.28195810887905565, + "learning_rate": 3.125e-05, + "loss": 0.6614, + "step": 75 + }, + { + "epoch": 0.23918174665617625, + "grad_norm": 0.24022069913129832, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.6593, + "step": 76 + }, + { + "epoch": 0.24232887490165225, + "grad_norm": 0.25139586251835144, + "learning_rate": 3.208333333333334e-05, + "loss": 0.659, + "step": 77 + }, + { + "epoch": 0.24547600314712825, + "grad_norm": 0.2566709925175564, + "learning_rate": 3.2500000000000004e-05, + "loss": 0.6542, + "step": 78 + }, + { + "epoch": 0.24862313139260425, + "grad_norm": 0.2883214448935213, + "learning_rate": 3.291666666666667e-05, + "loss": 0.6471, + "step": 79 + }, + { + "epoch": 0.25177025963808025, + "grad_norm": 0.30668415687027056, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.6439, + "step": 80 + }, + { + "epoch": 0.2549173878835563, + "grad_norm": 0.29042450307830464, + "learning_rate": 3.375e-05, + "loss": 0.651, + "step": 81 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.2736791002695721, + "learning_rate": 3.4166666666666666e-05, + "loss": 0.6467, + "step": 82 + }, + { + "epoch": 0.2612116443745083, + "grad_norm": 0.265465779092424, + "learning_rate": 3.458333333333334e-05, + "loss": 0.6412, + "step": 83 + }, + { + "epoch": 0.26435877261998425, + "grad_norm": 0.2968535790814613, + "learning_rate": 3.5000000000000004e-05, + "loss": 0.6574, + "step": 84 + }, + { + "epoch": 0.2675059008654603, + "grad_norm": 0.43190214956783235, + "learning_rate": 3.541666666666667e-05, + "loss": 0.6495, + "step": 85 + }, + { + "epoch": 0.27065302911093625, + "grad_norm": 0.632308039014685, + "learning_rate": 3.5833333333333335e-05, + "loss": 0.6515, + "step": 86 + }, + { + "epoch": 0.2738001573564123, + "grad_norm": 0.7849780285031561, + "learning_rate": 3.625e-05, + "loss": 0.6546, + "step": 87 + }, + { + "epoch": 0.2769472856018883, + "grad_norm": 0.7233136246737597, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.6468, + "step": 88 + }, + { + "epoch": 0.2800944138473643, + "grad_norm": 0.5392685671512011, + "learning_rate": 3.708333333333334e-05, + "loss": 0.6366, + "step": 89 + }, + { + "epoch": 0.2832415420928403, + "grad_norm": 0.6548726987366142, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.6433, + "step": 90 + }, + { + "epoch": 0.2863886703383163, + "grad_norm": 0.74072769066675, + "learning_rate": 3.791666666666667e-05, + "loss": 0.6421, + "step": 91 + }, + { + "epoch": 0.2895357985837923, + "grad_norm": 0.37857830042275176, + "learning_rate": 3.833333333333334e-05, + "loss": 0.65, + "step": 92 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 0.5508648164496704, + "learning_rate": 3.875e-05, + "loss": 0.6493, + "step": 93 + }, + { + "epoch": 0.2958300550747443, + "grad_norm": 0.5478861545420471, + "learning_rate": 3.9166666666666665e-05, + "loss": 0.6469, + "step": 94 + }, + { + "epoch": 0.2989771833202203, + "grad_norm": 0.3468090216652617, + "learning_rate": 3.958333333333334e-05, + "loss": 0.6514, + "step": 95 + }, + { + "epoch": 0.3021243115656963, + "grad_norm": 0.6547220457148604, + "learning_rate": 4e-05, + "loss": 0.649, + "step": 96 + }, + { + "epoch": 0.30527143981117233, + "grad_norm": 0.46761393726738054, + "learning_rate": 3.999999059985635e-05, + "loss": 0.6408, + "step": 97 + }, + { + "epoch": 0.3084185680566483, + "grad_norm": 0.39367909064047446, + "learning_rate": 3.99999623994352e-05, + "loss": 0.6365, + "step": 98 + }, + { + "epoch": 0.31156569630212433, + "grad_norm": 0.5946767742649087, + "learning_rate": 3.9999915398766006e-05, + "loss": 0.6366, + "step": 99 + }, + { + "epoch": 0.3147128245476003, + "grad_norm": 0.31375774268214407, + "learning_rate": 3.999984959789786e-05, + "loss": 0.6389, + "step": 100 + }, + { + "epoch": 0.31785995279307633, + "grad_norm": 0.5057217370873666, + "learning_rate": 3.9999764996899494e-05, + "loss": 0.6457, + "step": 101 + }, + { + "epoch": 0.3210070810385523, + "grad_norm": 0.35265559358910226, + "learning_rate": 3.9999661595859275e-05, + "loss": 0.6438, + "step": 102 + }, + { + "epoch": 0.3241542092840283, + "grad_norm": 0.341984180495186, + "learning_rate": 3.9999539394885177e-05, + "loss": 0.6275, + "step": 103 + }, + { + "epoch": 0.32730133752950435, + "grad_norm": 0.3862289663549392, + "learning_rate": 3.999939839410486e-05, + "loss": 0.6279, + "step": 104 + }, + { + "epoch": 0.3304484657749803, + "grad_norm": 0.30610508770190564, + "learning_rate": 3.999923859366557e-05, + "loss": 0.6335, + "step": 105 + }, + { + "epoch": 0.33359559402045635, + "grad_norm": 0.39738483622597887, + "learning_rate": 3.999905999373424e-05, + "loss": 0.6275, + "step": 106 + }, + { + "epoch": 0.3367427222659323, + "grad_norm": 0.34695466353973403, + "learning_rate": 3.9998862594497396e-05, + "loss": 0.634, + "step": 107 + }, + { + "epoch": 0.33988985051140835, + "grad_norm": 0.4434518808465586, + "learning_rate": 3.999864639616121e-05, + "loss": 0.6374, + "step": 108 + }, + { + "epoch": 0.3430369787568843, + "grad_norm": 0.33772070770009105, + "learning_rate": 3.99984113989515e-05, + "loss": 0.6266, + "step": 109 + }, + { + "epoch": 0.34618410700236035, + "grad_norm": 0.2584585866122632, + "learning_rate": 3.99981576031137e-05, + "loss": 0.6292, + "step": 110 + }, + { + "epoch": 0.3493312352478363, + "grad_norm": 0.3611261393186681, + "learning_rate": 3.9997885008912905e-05, + "loss": 0.6361, + "step": 111 + }, + { + "epoch": 0.35247836349331235, + "grad_norm": 0.3023341429429724, + "learning_rate": 3.999759361663381e-05, + "loss": 0.6325, + "step": 112 + }, + { + "epoch": 0.3556254917387884, + "grad_norm": 0.30908333541351135, + "learning_rate": 3.999728342658079e-05, + "loss": 0.6368, + "step": 113 + }, + { + "epoch": 0.35877261998426435, + "grad_norm": 0.265928899655407, + "learning_rate": 3.999695443907781e-05, + "loss": 0.6303, + "step": 114 + }, + { + "epoch": 0.3619197482297404, + "grad_norm": 0.27333927680685793, + "learning_rate": 3.9996606654468476e-05, + "loss": 0.6277, + "step": 115 + }, + { + "epoch": 0.36506687647521635, + "grad_norm": 0.2744818724487684, + "learning_rate": 3.9996240073116044e-05, + "loss": 0.6272, + "step": 116 + }, + { + "epoch": 0.3682140047206924, + "grad_norm": 0.2869505492537586, + "learning_rate": 3.99958546954034e-05, + "loss": 0.6165, + "step": 117 + }, + { + "epoch": 0.37136113296616835, + "grad_norm": 0.26133884085799125, + "learning_rate": 3.9995450521733044e-05, + "loss": 0.6303, + "step": 118 + }, + { + "epoch": 0.3745082612116444, + "grad_norm": 0.28364779766814496, + "learning_rate": 3.9995027552527126e-05, + "loss": 0.6355, + "step": 119 + }, + { + "epoch": 0.3776553894571204, + "grad_norm": 0.26991948715363395, + "learning_rate": 3.9994585788227425e-05, + "loss": 0.6353, + "step": 120 + }, + { + "epoch": 0.3808025177025964, + "grad_norm": 0.29168641093859365, + "learning_rate": 3.9994125229295335e-05, + "loss": 0.6347, + "step": 121 + }, + { + "epoch": 0.3839496459480724, + "grad_norm": 0.2778112649452421, + "learning_rate": 3.999364587621189e-05, + "loss": 0.6314, + "step": 122 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.25501755874765036, + "learning_rate": 3.9993147729477775e-05, + "loss": 0.6287, + "step": 123 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 0.34414954964963435, + "learning_rate": 3.999263078961327e-05, + "loss": 0.6278, + "step": 124 + }, + { + "epoch": 0.3933910306845004, + "grad_norm": 0.34610957165859696, + "learning_rate": 3.9992095057158304e-05, + "loss": 0.6216, + "step": 125 + }, + { + "epoch": 0.3965381589299764, + "grad_norm": 0.28674531389146546, + "learning_rate": 3.999154053267242e-05, + "loss": 0.638, + "step": 126 + }, + { + "epoch": 0.3996852871754524, + "grad_norm": 0.25576505127419086, + "learning_rate": 3.99909672167348e-05, + "loss": 0.6259, + "step": 127 + }, + { + "epoch": 0.4028324154209284, + "grad_norm": 0.31068566841934725, + "learning_rate": 3.9990375109944254e-05, + "loss": 0.6266, + "step": 128 + }, + { + "epoch": 0.40597954366640443, + "grad_norm": 0.43495770039787945, + "learning_rate": 3.998976421291921e-05, + "loss": 0.6194, + "step": 129 + }, + { + "epoch": 0.4091266719118804, + "grad_norm": 0.40876214950723583, + "learning_rate": 3.998913452629773e-05, + "loss": 0.6261, + "step": 130 + }, + { + "epoch": 0.41227380015735643, + "grad_norm": 0.26678746806822895, + "learning_rate": 3.998848605073749e-05, + "loss": 0.63, + "step": 131 + }, + { + "epoch": 0.4154209284028324, + "grad_norm": 0.2878327006301991, + "learning_rate": 3.9987818786915807e-05, + "loss": 0.6204, + "step": 132 + }, + { + "epoch": 0.41856805664830843, + "grad_norm": 0.3111848739668028, + "learning_rate": 3.9987132735529594e-05, + "loss": 0.6297, + "step": 133 + }, + { + "epoch": 0.4217151848937844, + "grad_norm": 0.2609480696864346, + "learning_rate": 3.998642789729543e-05, + "loss": 0.6231, + "step": 134 + }, + { + "epoch": 0.42486231313926043, + "grad_norm": 0.2811609267853307, + "learning_rate": 3.998570427294947e-05, + "loss": 0.6187, + "step": 135 + }, + { + "epoch": 0.42800944138473646, + "grad_norm": 0.3489218624854075, + "learning_rate": 3.998496186324753e-05, + "loss": 0.6286, + "step": 136 + }, + { + "epoch": 0.4311565696302124, + "grad_norm": 0.3209965825815324, + "learning_rate": 3.9984200668965e-05, + "loss": 0.6146, + "step": 137 + }, + { + "epoch": 0.43430369787568845, + "grad_norm": 0.2527692210447605, + "learning_rate": 3.998342069089694e-05, + "loss": 0.6203, + "step": 138 + }, + { + "epoch": 0.4374508261211644, + "grad_norm": 0.2863723927860823, + "learning_rate": 3.9982621929857994e-05, + "loss": 0.6186, + "step": 139 + }, + { + "epoch": 0.44059795436664045, + "grad_norm": 0.3362120402580175, + "learning_rate": 3.998180438668244e-05, + "loss": 0.6173, + "step": 140 + }, + { + "epoch": 0.4437450826121164, + "grad_norm": 0.29944731618068043, + "learning_rate": 3.998096806222417e-05, + "loss": 0.6079, + "step": 141 + }, + { + "epoch": 0.44689221085759245, + "grad_norm": 0.2521452559671069, + "learning_rate": 3.9980112957356705e-05, + "loss": 0.6249, + "step": 142 + }, + { + "epoch": 0.4500393391030684, + "grad_norm": 0.2545938988617545, + "learning_rate": 3.997923907297315e-05, + "loss": 0.6083, + "step": 143 + }, + { + "epoch": 0.45318646734854445, + "grad_norm": 0.25898746551692964, + "learning_rate": 3.997834640998624e-05, + "loss": 0.6146, + "step": 144 + }, + { + "epoch": 0.4563335955940205, + "grad_norm": 0.2771475593887788, + "learning_rate": 3.9977434969328344e-05, + "loss": 0.6155, + "step": 145 + }, + { + "epoch": 0.45948072383949645, + "grad_norm": 0.2715220470047786, + "learning_rate": 3.9976504751951415e-05, + "loss": 0.6139, + "step": 146 + }, + { + "epoch": 0.4626278520849725, + "grad_norm": 0.262357157875343, + "learning_rate": 3.997555575882702e-05, + "loss": 0.6109, + "step": 147 + }, + { + "epoch": 0.46577498033044845, + "grad_norm": 0.2656139774401674, + "learning_rate": 3.9974587990946365e-05, + "loss": 0.6195, + "step": 148 + }, + { + "epoch": 0.4689221085759245, + "grad_norm": 0.27484911731602474, + "learning_rate": 3.997360144932023e-05, + "loss": 0.6167, + "step": 149 + }, + { + "epoch": 0.47206923682140045, + "grad_norm": 0.27057626890655806, + "learning_rate": 3.997259613497902e-05, + "loss": 0.6268, + "step": 150 + }, + { + "epoch": 0.4752163650668765, + "grad_norm": 0.22502846697134835, + "learning_rate": 3.9971572048972754e-05, + "loss": 0.6159, + "step": 151 + }, + { + "epoch": 0.4783634933123525, + "grad_norm": 0.2812778266326769, + "learning_rate": 3.997052919237105e-05, + "loss": 0.621, + "step": 152 + }, + { + "epoch": 0.4815106215578285, + "grad_norm": 0.28278818974675024, + "learning_rate": 3.9969467566263115e-05, + "loss": 0.6238, + "step": 153 + }, + { + "epoch": 0.4846577498033045, + "grad_norm": 0.3420285701015734, + "learning_rate": 3.996838717175779e-05, + "loss": 0.6118, + "step": 154 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 0.35539942258827856, + "learning_rate": 3.9967288009983496e-05, + "loss": 0.6168, + "step": 155 + }, + { + "epoch": 0.4909520062942565, + "grad_norm": 0.29368139728658277, + "learning_rate": 3.996617008208827e-05, + "loss": 0.6049, + "step": 156 + }, + { + "epoch": 0.4940991345397325, + "grad_norm": 0.27690796066202816, + "learning_rate": 3.996503338923974e-05, + "loss": 0.6151, + "step": 157 + }, + { + "epoch": 0.4972462627852085, + "grad_norm": 0.41334059184826466, + "learning_rate": 3.9963877932625134e-05, + "loss": 0.6184, + "step": 158 + }, + { + "epoch": 0.5003933910306845, + "grad_norm": 0.42636029329489616, + "learning_rate": 3.996270371345129e-05, + "loss": 0.6144, + "step": 159 + }, + { + "epoch": 0.5035405192761605, + "grad_norm": 0.2810420864283484, + "learning_rate": 3.9961510732944624e-05, + "loss": 0.6185, + "step": 160 + }, + { + "epoch": 0.5066876475216365, + "grad_norm": 0.3032979670400302, + "learning_rate": 3.996029899235116e-05, + "loss": 0.6009, + "step": 161 + }, + { + "epoch": 0.5098347757671126, + "grad_norm": 0.4124798386817749, + "learning_rate": 3.9959068492936517e-05, + "loss": 0.608, + "step": 162 + }, + { + "epoch": 0.5129819040125885, + "grad_norm": 0.46292941599151394, + "learning_rate": 3.99578192359859e-05, + "loss": 0.6303, + "step": 163 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.4249023229216657, + "learning_rate": 3.99565512228041e-05, + "loss": 0.6158, + "step": 164 + }, + { + "epoch": 0.5192761605035405, + "grad_norm": 0.3081385660055859, + "learning_rate": 3.9955264454715524e-05, + "loss": 0.604, + "step": 165 + }, + { + "epoch": 0.5224232887490166, + "grad_norm": 0.3115598725592295, + "learning_rate": 3.995395893306414e-05, + "loss": 0.6132, + "step": 166 + }, + { + "epoch": 0.5255704169944925, + "grad_norm": 0.423339283046509, + "learning_rate": 3.995263465921351e-05, + "loss": 0.6133, + "step": 167 + }, + { + "epoch": 0.5287175452399685, + "grad_norm": 0.38579662867406184, + "learning_rate": 3.9951291634546784e-05, + "loss": 0.6046, + "step": 168 + }, + { + "epoch": 0.5318646734854445, + "grad_norm": 0.2646661405581634, + "learning_rate": 3.9949929860466715e-05, + "loss": 0.6065, + "step": 169 + }, + { + "epoch": 0.5350118017309206, + "grad_norm": 0.29299500671952294, + "learning_rate": 3.994854933839561e-05, + "loss": 0.597, + "step": 170 + }, + { + "epoch": 0.5381589299763966, + "grad_norm": 0.3801312863346971, + "learning_rate": 3.994715006977536e-05, + "loss": 0.609, + "step": 171 + }, + { + "epoch": 0.5413060582218725, + "grad_norm": 0.2636893704090895, + "learning_rate": 3.994573205606747e-05, + "loss": 0.6059, + "step": 172 + }, + { + "epoch": 0.5444531864673485, + "grad_norm": 0.2587055020132136, + "learning_rate": 3.994429529875298e-05, + "loss": 0.5968, + "step": 173 + }, + { + "epoch": 0.5476003147128246, + "grad_norm": 0.3153597828303425, + "learning_rate": 3.994283979933254e-05, + "loss": 0.6133, + "step": 174 + }, + { + "epoch": 0.5507474429583006, + "grad_norm": 0.26775970858600634, + "learning_rate": 3.994136555932635e-05, + "loss": 0.6045, + "step": 175 + }, + { + "epoch": 0.5538945712037766, + "grad_norm": 0.3094235529607136, + "learning_rate": 3.993987258027419e-05, + "loss": 0.6089, + "step": 176 + }, + { + "epoch": 0.5570416994492525, + "grad_norm": 0.28763348823077006, + "learning_rate": 3.9938360863735435e-05, + "loss": 0.609, + "step": 177 + }, + { + "epoch": 0.5601888276947286, + "grad_norm": 0.2177547953667043, + "learning_rate": 3.9936830411289e-05, + "loss": 0.6154, + "step": 178 + }, + { + "epoch": 0.5633359559402046, + "grad_norm": 0.2817386027141762, + "learning_rate": 3.993528122453339e-05, + "loss": 0.6119, + "step": 179 + }, + { + "epoch": 0.5664830841856806, + "grad_norm": 0.2844686972678976, + "learning_rate": 3.993371330508666e-05, + "loss": 0.5981, + "step": 180 + }, + { + "epoch": 0.5696302124311565, + "grad_norm": 0.2448670332544363, + "learning_rate": 3.9932126654586446e-05, + "loss": 0.5915, + "step": 181 + }, + { + "epoch": 0.5727773406766326, + "grad_norm": 0.2597177617957836, + "learning_rate": 3.993052127468994e-05, + "loss": 0.5928, + "step": 182 + }, + { + "epoch": 0.5759244689221086, + "grad_norm": 0.2215768221036163, + "learning_rate": 3.99288971670739e-05, + "loss": 0.6161, + "step": 183 + }, + { + "epoch": 0.5790715971675846, + "grad_norm": 0.2699731828037381, + "learning_rate": 3.9927254333434656e-05, + "loss": 0.5921, + "step": 184 + }, + { + "epoch": 0.5822187254130606, + "grad_norm": 0.29227902549722135, + "learning_rate": 3.9925592775488046e-05, + "loss": 0.5976, + "step": 185 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 0.2541877399622803, + "learning_rate": 3.9923912494969536e-05, + "loss": 0.6102, + "step": 186 + }, + { + "epoch": 0.5885129819040126, + "grad_norm": 0.3043738596400513, + "learning_rate": 3.9922213493634096e-05, + "loss": 0.611, + "step": 187 + }, + { + "epoch": 0.5916601101494886, + "grad_norm": 0.2769382151889082, + "learning_rate": 3.992049577325627e-05, + "loss": 0.609, + "step": 188 + }, + { + "epoch": 0.5948072383949646, + "grad_norm": 0.23411388895804314, + "learning_rate": 3.991875933563014e-05, + "loss": 0.5983, + "step": 189 + }, + { + "epoch": 0.5979543666404405, + "grad_norm": 0.31989694156952164, + "learning_rate": 3.991700418256936e-05, + "loss": 0.6045, + "step": 190 + }, + { + "epoch": 0.6011014948859166, + "grad_norm": 0.38404257854635715, + "learning_rate": 3.991523031590711e-05, + "loss": 0.6063, + "step": 191 + }, + { + "epoch": 0.6042486231313926, + "grad_norm": 0.33761081359143924, + "learning_rate": 3.9913437737496135e-05, + "loss": 0.5951, + "step": 192 + }, + { + "epoch": 0.6073957513768686, + "grad_norm": 0.2381342715991919, + "learning_rate": 3.9911626449208694e-05, + "loss": 0.601, + "step": 193 + }, + { + "epoch": 0.6105428796223447, + "grad_norm": 0.31880643686538623, + "learning_rate": 3.9909796452936616e-05, + "loss": 0.6009, + "step": 194 + }, + { + "epoch": 0.6136900078678206, + "grad_norm": 0.3563025725018504, + "learning_rate": 3.990794775059126e-05, + "loss": 0.6009, + "step": 195 + }, + { + "epoch": 0.6168371361132966, + "grad_norm": 0.3033415317564058, + "learning_rate": 3.9906080344103516e-05, + "loss": 0.5992, + "step": 196 + }, + { + "epoch": 0.6199842643587726, + "grad_norm": 0.2775053050931378, + "learning_rate": 3.990419423542383e-05, + "loss": 0.5987, + "step": 197 + }, + { + "epoch": 0.6231313926042487, + "grad_norm": 0.2614901374015711, + "learning_rate": 3.990228942652215e-05, + "loss": 0.5918, + "step": 198 + }, + { + "epoch": 0.6262785208497246, + "grad_norm": 0.2977635557001149, + "learning_rate": 3.9900365919387985e-05, + "loss": 0.6046, + "step": 199 + }, + { + "epoch": 0.6294256490952006, + "grad_norm": 0.30438529335477493, + "learning_rate": 3.9898423716030364e-05, + "loss": 0.5966, + "step": 200 + }, + { + "epoch": 0.6325727773406766, + "grad_norm": 0.28279872927198246, + "learning_rate": 3.989646281847783e-05, + "loss": 0.5943, + "step": 201 + }, + { + "epoch": 0.6357199055861527, + "grad_norm": 0.25795220306495825, + "learning_rate": 3.989448322877848e-05, + "loss": 0.5989, + "step": 202 + }, + { + "epoch": 0.6388670338316287, + "grad_norm": 0.280857506484411, + "learning_rate": 3.98924849489999e-05, + "loss": 0.595, + "step": 203 + }, + { + "epoch": 0.6420141620771046, + "grad_norm": 0.28147222245655734, + "learning_rate": 3.989046798122922e-05, + "loss": 0.5968, + "step": 204 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.22772337446133548, + "learning_rate": 3.988843232757308e-05, + "loss": 0.5895, + "step": 205 + }, + { + "epoch": 0.6483084185680567, + "grad_norm": 0.24725422874191516, + "learning_rate": 3.9886377990157645e-05, + "loss": 0.5915, + "step": 206 + }, + { + "epoch": 0.6514555468135327, + "grad_norm": 0.2478883381265632, + "learning_rate": 3.988430497112859e-05, + "loss": 0.5946, + "step": 207 + }, + { + "epoch": 0.6546026750590087, + "grad_norm": 0.20803618305397573, + "learning_rate": 3.988221327265111e-05, + "loss": 0.6081, + "step": 208 + }, + { + "epoch": 0.6577498033044846, + "grad_norm": 0.25475933367836384, + "learning_rate": 3.988010289690987e-05, + "loss": 0.6017, + "step": 209 + }, + { + "epoch": 0.6608969315499607, + "grad_norm": 0.24824235295137567, + "learning_rate": 3.987797384610911e-05, + "loss": 0.6028, + "step": 210 + }, + { + "epoch": 0.6640440597954367, + "grad_norm": 0.23024676822564225, + "learning_rate": 3.9875826122472514e-05, + "loss": 0.5947, + "step": 211 + }, + { + "epoch": 0.6671911880409127, + "grad_norm": 0.27973892618861823, + "learning_rate": 3.987365972824331e-05, + "loss": 0.5977, + "step": 212 + }, + { + "epoch": 0.6703383162863886, + "grad_norm": 0.21516896519325748, + "learning_rate": 3.98714746656842e-05, + "loss": 0.601, + "step": 213 + }, + { + "epoch": 0.6734854445318647, + "grad_norm": 0.20803886239420252, + "learning_rate": 3.98692709370774e-05, + "loss": 0.5969, + "step": 214 + }, + { + "epoch": 0.6766325727773407, + "grad_norm": 0.238940879654807, + "learning_rate": 3.986704854472462e-05, + "loss": 0.5985, + "step": 215 + }, + { + "epoch": 0.6797797010228167, + "grad_norm": 0.24651655899332123, + "learning_rate": 3.9864807490947056e-05, + "loss": 0.5984, + "step": 216 + }, + { + "epoch": 0.6829268292682927, + "grad_norm": 0.26084076415464624, + "learning_rate": 3.98625477780854e-05, + "loss": 0.5932, + "step": 217 + }, + { + "epoch": 0.6860739575137687, + "grad_norm": 0.2386108054460097, + "learning_rate": 3.9860269408499844e-05, + "loss": 0.5842, + "step": 218 + }, + { + "epoch": 0.6892210857592447, + "grad_norm": 0.2844335881193996, + "learning_rate": 3.9857972384570035e-05, + "loss": 0.595, + "step": 219 + }, + { + "epoch": 0.6923682140047207, + "grad_norm": 0.2700792716329994, + "learning_rate": 3.985565670869513e-05, + "loss": 0.5965, + "step": 220 + }, + { + "epoch": 0.6955153422501967, + "grad_norm": 0.25960095481333906, + "learning_rate": 3.985332238329378e-05, + "loss": 0.5916, + "step": 221 + }, + { + "epoch": 0.6986624704956726, + "grad_norm": 0.28288643057296725, + "learning_rate": 3.9850969410804065e-05, + "loss": 0.5995, + "step": 222 + }, + { + "epoch": 0.7018095987411487, + "grad_norm": 0.24327917475329708, + "learning_rate": 3.98485977936836e-05, + "loss": 0.5959, + "step": 223 + }, + { + "epoch": 0.7049567269866247, + "grad_norm": 0.2721668752895481, + "learning_rate": 3.984620753440943e-05, + "loss": 0.5994, + "step": 224 + }, + { + "epoch": 0.7081038552321007, + "grad_norm": 0.2607076361644052, + "learning_rate": 3.984379863547808e-05, + "loss": 0.5943, + "step": 225 + }, + { + "epoch": 0.7112509834775768, + "grad_norm": 0.32932315230284676, + "learning_rate": 3.984137109940556e-05, + "loss": 0.5918, + "step": 226 + }, + { + "epoch": 0.7143981117230527, + "grad_norm": 0.2696383359131296, + "learning_rate": 3.983892492872733e-05, + "loss": 0.5906, + "step": 227 + }, + { + "epoch": 0.7175452399685287, + "grad_norm": 0.3585597053241887, + "learning_rate": 3.9836460125998334e-05, + "loss": 0.5948, + "step": 228 + }, + { + "epoch": 0.7206923682140047, + "grad_norm": 0.2909815414499778, + "learning_rate": 3.9833976693792937e-05, + "loss": 0.5967, + "step": 229 + }, + { + "epoch": 0.7238394964594808, + "grad_norm": 0.26425876255452163, + "learning_rate": 3.9831474634705005e-05, + "loss": 0.5935, + "step": 230 + }, + { + "epoch": 0.7269866247049567, + "grad_norm": 0.2425151164990489, + "learning_rate": 3.982895395134782e-05, + "loss": 0.589, + "step": 231 + }, + { + "epoch": 0.7301337529504327, + "grad_norm": 0.2616193102994722, + "learning_rate": 3.982641464635416e-05, + "loss": 0.6018, + "step": 232 + }, + { + "epoch": 0.7332808811959087, + "grad_norm": 0.24405998282776664, + "learning_rate": 3.982385672237621e-05, + "loss": 0.5784, + "step": 233 + }, + { + "epoch": 0.7364280094413848, + "grad_norm": 0.21832468260666305, + "learning_rate": 3.9821280182085625e-05, + "loss": 0.6015, + "step": 234 + }, + { + "epoch": 0.7395751376868608, + "grad_norm": 0.25650060654854345, + "learning_rate": 3.98186850281735e-05, + "loss": 0.5913, + "step": 235 + }, + { + "epoch": 0.7427222659323367, + "grad_norm": 0.27580046393197283, + "learning_rate": 3.981607126335038e-05, + "loss": 0.5895, + "step": 236 + }, + { + "epoch": 0.7458693941778127, + "grad_norm": 0.2565257806459118, + "learning_rate": 3.981343889034622e-05, + "loss": 0.5919, + "step": 237 + }, + { + "epoch": 0.7490165224232888, + "grad_norm": 0.28129400118590575, + "learning_rate": 3.981078791191044e-05, + "loss": 0.5824, + "step": 238 + }, + { + "epoch": 0.7521636506687648, + "grad_norm": 0.27891204249277274, + "learning_rate": 3.980811833081189e-05, + "loss": 0.592, + "step": 239 + }, + { + "epoch": 0.7553107789142408, + "grad_norm": 0.23957189182523364, + "learning_rate": 3.9805430149838826e-05, + "loss": 0.5923, + "step": 240 + }, + { + "epoch": 0.7584579071597167, + "grad_norm": 0.267527485388114, + "learning_rate": 3.980272337179895e-05, + "loss": 0.5915, + "step": 241 + }, + { + "epoch": 0.7616050354051928, + "grad_norm": 0.2893661929586173, + "learning_rate": 3.97999979995194e-05, + "loss": 0.5911, + "step": 242 + }, + { + "epoch": 0.7647521636506688, + "grad_norm": 0.28637666161476577, + "learning_rate": 3.97972540358467e-05, + "loss": 0.5752, + "step": 243 + }, + { + "epoch": 0.7678992918961448, + "grad_norm": 0.22704240396877157, + "learning_rate": 3.979449148364682e-05, + "loss": 0.5755, + "step": 244 + }, + { + "epoch": 0.7710464201416207, + "grad_norm": 0.25773571819059654, + "learning_rate": 3.979171034580514e-05, + "loss": 0.5983, + "step": 245 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.3238069584967583, + "learning_rate": 3.9788910625226435e-05, + "loss": 0.5841, + "step": 246 + }, + { + "epoch": 0.7773406766325728, + "grad_norm": 0.3222043091055347, + "learning_rate": 3.978609232483491e-05, + "loss": 0.59, + "step": 247 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 0.23894790835594737, + "learning_rate": 3.978325544757419e-05, + "loss": 0.5855, + "step": 248 + }, + { + "epoch": 0.7836349331235248, + "grad_norm": 0.3065610131625477, + "learning_rate": 3.9780399996407235e-05, + "loss": 0.5872, + "step": 249 + }, + { + "epoch": 0.7867820613690008, + "grad_norm": 0.41439567088342205, + "learning_rate": 3.977752597431649e-05, + "loss": 0.5922, + "step": 250 + }, + { + "epoch": 0.7899291896144768, + "grad_norm": 0.33164301867172113, + "learning_rate": 3.977463338430375e-05, + "loss": 0.5966, + "step": 251 + }, + { + "epoch": 0.7930763178599528, + "grad_norm": 0.338521625558078, + "learning_rate": 3.977172222939019e-05, + "loss": 0.5907, + "step": 252 + }, + { + "epoch": 0.7962234461054288, + "grad_norm": 0.4350142509786572, + "learning_rate": 3.976879251261641e-05, + "loss": 0.5841, + "step": 253 + }, + { + "epoch": 0.7993705743509048, + "grad_norm": 0.36993049984822834, + "learning_rate": 3.9765844237042385e-05, + "loss": 0.5864, + "step": 254 + }, + { + "epoch": 0.8025177025963808, + "grad_norm": 0.313011441001989, + "learning_rate": 3.976287740574748e-05, + "loss": 0.5955, + "step": 255 + }, + { + "epoch": 0.8056648308418568, + "grad_norm": 0.3540292456063131, + "learning_rate": 3.975989202183041e-05, + "loss": 0.5957, + "step": 256 + }, + { + "epoch": 0.8088119590873328, + "grad_norm": 0.33646403434651423, + "learning_rate": 3.9756888088409314e-05, + "loss": 0.5847, + "step": 257 + }, + { + "epoch": 0.8119590873328089, + "grad_norm": 0.26982248963868355, + "learning_rate": 3.975386560862166e-05, + "loss": 0.5885, + "step": 258 + }, + { + "epoch": 0.8151062155782848, + "grad_norm": 0.29403731858846366, + "learning_rate": 3.975082458562433e-05, + "loss": 0.5897, + "step": 259 + }, + { + "epoch": 0.8182533438237608, + "grad_norm": 0.4082653715355324, + "learning_rate": 3.974776502259354e-05, + "loss": 0.5791, + "step": 260 + }, + { + "epoch": 0.8214004720692368, + "grad_norm": 0.3510739201797744, + "learning_rate": 3.9744686922724876e-05, + "loss": 0.593, + "step": 261 + }, + { + "epoch": 0.8245476003147129, + "grad_norm": 0.3513487612479217, + "learning_rate": 3.97415902892333e-05, + "loss": 0.5836, + "step": 262 + }, + { + "epoch": 0.8276947285601888, + "grad_norm": 0.28586935653924184, + "learning_rate": 3.973847512535313e-05, + "loss": 0.5826, + "step": 263 + }, + { + "epoch": 0.8308418568056648, + "grad_norm": 0.2318470192789233, + "learning_rate": 3.973534143433802e-05, + "loss": 0.5814, + "step": 264 + }, + { + "epoch": 0.8339889850511408, + "grad_norm": 0.29154683610806104, + "learning_rate": 3.9732189219460994e-05, + "loss": 0.5797, + "step": 265 + }, + { + "epoch": 0.8371361132966169, + "grad_norm": 0.31587273712728664, + "learning_rate": 3.972901848401441e-05, + "loss": 0.5831, + "step": 266 + }, + { + "epoch": 0.8402832415420929, + "grad_norm": 0.23591831976720817, + "learning_rate": 3.972582923130998e-05, + "loss": 0.5737, + "step": 267 + }, + { + "epoch": 0.8434303697875688, + "grad_norm": 0.25691795262588335, + "learning_rate": 3.972262146467874e-05, + "loss": 0.5786, + "step": 268 + }, + { + "epoch": 0.8465774980330448, + "grad_norm": 0.32432381849212155, + "learning_rate": 3.971939518747109e-05, + "loss": 0.593, + "step": 269 + }, + { + "epoch": 0.8497246262785209, + "grad_norm": 0.28695101570482007, + "learning_rate": 3.9716150403056746e-05, + "loss": 0.5796, + "step": 270 + }, + { + "epoch": 0.8528717545239969, + "grad_norm": 0.2724926989579401, + "learning_rate": 3.971288711482476e-05, + "loss": 0.5741, + "step": 271 + }, + { + "epoch": 0.8560188827694729, + "grad_norm": 0.2806703214608174, + "learning_rate": 3.970960532618349e-05, + "loss": 0.5836, + "step": 272 + }, + { + "epoch": 0.8591660110149488, + "grad_norm": 0.24677303795151184, + "learning_rate": 3.9706305040560644e-05, + "loss": 0.5818, + "step": 273 + }, + { + "epoch": 0.8623131392604249, + "grad_norm": 0.2771238866050482, + "learning_rate": 3.9702986261403255e-05, + "loss": 0.5781, + "step": 274 + }, + { + "epoch": 0.8654602675059009, + "grad_norm": 0.2924337657823486, + "learning_rate": 3.9699648992177626e-05, + "loss": 0.5756, + "step": 275 + }, + { + "epoch": 0.8686073957513769, + "grad_norm": 0.25885016469830363, + "learning_rate": 3.969629323636944e-05, + "loss": 0.5844, + "step": 276 + }, + { + "epoch": 0.8717545239968528, + "grad_norm": 0.23987091757624832, + "learning_rate": 3.9692918997483614e-05, + "loss": 0.5733, + "step": 277 + }, + { + "epoch": 0.8749016522423289, + "grad_norm": 0.3337090298700013, + "learning_rate": 3.968952627904443e-05, + "loss": 0.571, + "step": 278 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 0.29538904985776865, + "learning_rate": 3.9686115084595444e-05, + "loss": 0.5801, + "step": 279 + }, + { + "epoch": 0.8811959087332809, + "grad_norm": 0.2312585310388234, + "learning_rate": 3.968268541769951e-05, + "loss": 0.5835, + "step": 280 + }, + { + "epoch": 0.8843430369787569, + "grad_norm": 0.24512094230884698, + "learning_rate": 3.967923728193878e-05, + "loss": 0.5854, + "step": 281 + }, + { + "epoch": 0.8874901652242329, + "grad_norm": 0.2903110478982194, + "learning_rate": 3.96757706809147e-05, + "loss": 0.5837, + "step": 282 + }, + { + "epoch": 0.8906372934697089, + "grad_norm": 0.34977008813511384, + "learning_rate": 3.967228561824798e-05, + "loss": 0.5767, + "step": 283 + }, + { + "epoch": 0.8937844217151849, + "grad_norm": 0.25337253280718625, + "learning_rate": 3.9668782097578656e-05, + "loss": 0.5839, + "step": 284 + }, + { + "epoch": 0.8969315499606609, + "grad_norm": 0.21636074579356032, + "learning_rate": 3.9665260122566e-05, + "loss": 0.5804, + "step": 285 + }, + { + "epoch": 0.9000786782061369, + "grad_norm": 0.33233711739245153, + "learning_rate": 3.966171969688858e-05, + "loss": 0.5737, + "step": 286 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.2680586022952418, + "learning_rate": 3.965816082424423e-05, + "loss": 0.5729, + "step": 287 + }, + { + "epoch": 0.9063729346970889, + "grad_norm": 0.24065778232025806, + "learning_rate": 3.965458350835005e-05, + "loss": 0.5786, + "step": 288 + }, + { + "epoch": 0.9095200629425649, + "grad_norm": 0.24359799626407486, + "learning_rate": 3.965098775294241e-05, + "loss": 0.5743, + "step": 289 + }, + { + "epoch": 0.912667191188041, + "grad_norm": 0.21669535169553875, + "learning_rate": 3.964737356177692e-05, + "loss": 0.5798, + "step": 290 + }, + { + "epoch": 0.9158143194335169, + "grad_norm": 0.24527385896938736, + "learning_rate": 3.9643740938628485e-05, + "loss": 0.5771, + "step": 291 + }, + { + "epoch": 0.9189614476789929, + "grad_norm": 0.24211076465986903, + "learning_rate": 3.964008988729121e-05, + "loss": 0.5733, + "step": 292 + }, + { + "epoch": 0.9221085759244689, + "grad_norm": 0.2589502093646541, + "learning_rate": 3.9636420411578486e-05, + "loss": 0.5755, + "step": 293 + }, + { + "epoch": 0.925255704169945, + "grad_norm": 0.25126849259271594, + "learning_rate": 3.963273251532294e-05, + "loss": 0.5866, + "step": 294 + }, + { + "epoch": 0.9284028324154209, + "grad_norm": 0.28516274664724456, + "learning_rate": 3.962902620237642e-05, + "loss": 0.5803, + "step": 295 + }, + { + "epoch": 0.9315499606608969, + "grad_norm": 0.23946824983891027, + "learning_rate": 3.9625301476610035e-05, + "loss": 0.588, + "step": 296 + }, + { + "epoch": 0.9346970889063729, + "grad_norm": 0.25073581626718944, + "learning_rate": 3.9621558341914104e-05, + "loss": 0.5811, + "step": 297 + }, + { + "epoch": 0.937844217151849, + "grad_norm": 0.22421588771080747, + "learning_rate": 3.9617796802198193e-05, + "loss": 0.5809, + "step": 298 + }, + { + "epoch": 0.940991345397325, + "grad_norm": 0.28680785833388234, + "learning_rate": 3.961401686139108e-05, + "loss": 0.5762, + "step": 299 + }, + { + "epoch": 0.9441384736428009, + "grad_norm": 0.2912561340919721, + "learning_rate": 3.961021852344075e-05, + "loss": 0.5685, + "step": 300 + }, + { + "epoch": 0.9472856018882769, + "grad_norm": 0.25530982724652573, + "learning_rate": 3.960640179231443e-05, + "loss": 0.571, + "step": 301 + }, + { + "epoch": 0.950432730133753, + "grad_norm": 0.23274023004495994, + "learning_rate": 3.960256667199854e-05, + "loss": 0.5766, + "step": 302 + }, + { + "epoch": 0.953579858379229, + "grad_norm": 0.2526565624008317, + "learning_rate": 3.959871316649872e-05, + "loss": 0.5811, + "step": 303 + }, + { + "epoch": 0.956726986624705, + "grad_norm": 0.2669242519610837, + "learning_rate": 3.959484127983979e-05, + "loss": 0.5786, + "step": 304 + }, + { + "epoch": 0.9598741148701809, + "grad_norm": 0.2272805823487035, + "learning_rate": 3.959095101606579e-05, + "loss": 0.5804, + "step": 305 + }, + { + "epoch": 0.963021243115657, + "grad_norm": 0.32599108271772875, + "learning_rate": 3.958704237923994e-05, + "loss": 0.565, + "step": 306 + }, + { + "epoch": 0.966168371361133, + "grad_norm": 0.31736583791904754, + "learning_rate": 3.958311537344467e-05, + "loss": 0.5744, + "step": 307 + }, + { + "epoch": 0.969315499606609, + "grad_norm": 0.25057991032117954, + "learning_rate": 3.957917000278156e-05, + "loss": 0.5744, + "step": 308 + }, + { + "epoch": 0.9724626278520849, + "grad_norm": 0.3171592672476318, + "learning_rate": 3.9575206271371416e-05, + "loss": 0.5727, + "step": 309 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.2698770129342592, + "learning_rate": 3.957122418335419e-05, + "loss": 0.5684, + "step": 310 + }, + { + "epoch": 0.978756884343037, + "grad_norm": 0.22946805293885594, + "learning_rate": 3.956722374288902e-05, + "loss": 0.5675, + "step": 311 + }, + { + "epoch": 0.981904012588513, + "grad_norm": 0.3173852201061959, + "learning_rate": 3.9563204954154194e-05, + "loss": 0.5691, + "step": 312 + }, + { + "epoch": 0.985051140833989, + "grad_norm": 0.3705950113218817, + "learning_rate": 3.955916782134719e-05, + "loss": 0.5841, + "step": 313 + }, + { + "epoch": 0.988198269079465, + "grad_norm": 0.24571947086560714, + "learning_rate": 3.9555112348684626e-05, + "loss": 0.5763, + "step": 314 + }, + { + "epoch": 0.991345397324941, + "grad_norm": 0.3095154791392979, + "learning_rate": 3.955103854040228e-05, + "loss": 0.5672, + "step": 315 + }, + { + "epoch": 0.994492525570417, + "grad_norm": 0.3430882274581819, + "learning_rate": 3.9546946400755104e-05, + "loss": 0.5623, + "step": 316 + }, + { + "epoch": 0.997639653815893, + "grad_norm": 0.21959520260949345, + "learning_rate": 3.954283593401715e-05, + "loss": 0.5667, + "step": 317 + }, + { + "epoch": 1.003147128245476, + "grad_norm": 0.6182457687415346, + "learning_rate": 3.9538707144481656e-05, + "loss": 1.1053, + "step": 318 + }, + { + "epoch": 1.006294256490952, + "grad_norm": 0.8091197174119846, + "learning_rate": 3.953456003646097e-05, + "loss": 0.5479, + "step": 319 + }, + { + "epoch": 1.009441384736428, + "grad_norm": 0.5734582312017469, + "learning_rate": 3.953039461428659e-05, + "loss": 0.5401, + "step": 320 + }, + { + "epoch": 1.012588512981904, + "grad_norm": 0.5553450254239277, + "learning_rate": 3.952621088230912e-05, + "loss": 0.5444, + "step": 321 + }, + { + "epoch": 1.01573564122738, + "grad_norm": 0.715209019372237, + "learning_rate": 3.9522008844898316e-05, + "loss": 0.5482, + "step": 322 + }, + { + "epoch": 1.018882769472856, + "grad_norm": 0.3194143113644003, + "learning_rate": 3.9517788506443036e-05, + "loss": 0.5486, + "step": 323 + }, + { + "epoch": 1.022029897718332, + "grad_norm": 0.5794083060853625, + "learning_rate": 3.9513549871351244e-05, + "loss": 0.5387, + "step": 324 + }, + { + "epoch": 1.025177025963808, + "grad_norm": 0.4549904471790254, + "learning_rate": 3.950929294405005e-05, + "loss": 0.5417, + "step": 325 + }, + { + "epoch": 1.028324154209284, + "grad_norm": 0.39940752965584325, + "learning_rate": 3.950501772898563e-05, + "loss": 0.5383, + "step": 326 + }, + { + "epoch": 1.03147128245476, + "grad_norm": 0.37376657983185657, + "learning_rate": 3.9500724230623285e-05, + "loss": 0.5438, + "step": 327 + }, + { + "epoch": 1.034618410700236, + "grad_norm": 0.3249939876794576, + "learning_rate": 3.9496412453447396e-05, + "loss": 0.5423, + "step": 328 + }, + { + "epoch": 1.037765538945712, + "grad_norm": 0.362363957521138, + "learning_rate": 3.949208240196145e-05, + "loss": 0.5341, + "step": 329 + }, + { + "epoch": 1.0409126671911881, + "grad_norm": 0.3241126445768653, + "learning_rate": 3.948773408068801e-05, + "loss": 0.5377, + "step": 330 + }, + { + "epoch": 1.044059795436664, + "grad_norm": 0.3368256917676624, + "learning_rate": 3.948336749416873e-05, + "loss": 0.55, + "step": 331 + }, + { + "epoch": 1.04720692368214, + "grad_norm": 0.32581092992360716, + "learning_rate": 3.947898264696433e-05, + "loss": 0.5427, + "step": 332 + }, + { + "epoch": 1.050354051927616, + "grad_norm": 0.31016030261198896, + "learning_rate": 3.947457954365461e-05, + "loss": 0.5355, + "step": 333 + }, + { + "epoch": 1.053501180173092, + "grad_norm": 0.3133942156383092, + "learning_rate": 3.947015818883845e-05, + "loss": 0.5337, + "step": 334 + }, + { + "epoch": 1.056648308418568, + "grad_norm": 0.3012762044302009, + "learning_rate": 3.946571858713376e-05, + "loss": 0.5451, + "step": 335 + }, + { + "epoch": 1.059795436664044, + "grad_norm": 0.32025563813718905, + "learning_rate": 3.946126074317755e-05, + "loss": 0.5263, + "step": 336 + }, + { + "epoch": 1.06294256490952, + "grad_norm": 0.31757922834182667, + "learning_rate": 3.9456784661625845e-05, + "loss": 0.5407, + "step": 337 + }, + { + "epoch": 1.0660896931549961, + "grad_norm": 0.2724133341234047, + "learning_rate": 3.945229034715374e-05, + "loss": 0.5451, + "step": 338 + }, + { + "epoch": 1.0692368214004722, + "grad_norm": 0.31611533863952757, + "learning_rate": 3.944777780445537e-05, + "loss": 0.5314, + "step": 339 + }, + { + "epoch": 1.072383949645948, + "grad_norm": 0.264029405971174, + "learning_rate": 3.94432470382439e-05, + "loss": 0.5395, + "step": 340 + }, + { + "epoch": 1.075531077891424, + "grad_norm": 0.2403700990983929, + "learning_rate": 3.9438698053251545e-05, + "loss": 0.5454, + "step": 341 + }, + { + "epoch": 1.0786782061369, + "grad_norm": 0.26014062877023636, + "learning_rate": 3.943413085422954e-05, + "loss": 0.5303, + "step": 342 + }, + { + "epoch": 1.081825334382376, + "grad_norm": 0.24331842687849348, + "learning_rate": 3.942954544594814e-05, + "loss": 0.5393, + "step": 343 + }, + { + "epoch": 1.084972462627852, + "grad_norm": 0.2246092745659522, + "learning_rate": 3.942494183319662e-05, + "loss": 0.5388, + "step": 344 + }, + { + "epoch": 1.088119590873328, + "grad_norm": 0.26647954408169594, + "learning_rate": 3.942032002078326e-05, + "loss": 0.5412, + "step": 345 + }, + { + "epoch": 1.0912667191188041, + "grad_norm": 0.22770410942233646, + "learning_rate": 3.941568001353539e-05, + "loss": 0.5319, + "step": 346 + }, + { + "epoch": 1.0944138473642802, + "grad_norm": 0.2554265454591065, + "learning_rate": 3.94110218162993e-05, + "loss": 0.5356, + "step": 347 + }, + { + "epoch": 1.0975609756097562, + "grad_norm": 0.29466263774038076, + "learning_rate": 3.9406345433940284e-05, + "loss": 0.5375, + "step": 348 + }, + { + "epoch": 1.100708103855232, + "grad_norm": 0.25834878315712795, + "learning_rate": 3.940165087134264e-05, + "loss": 0.5379, + "step": 349 + }, + { + "epoch": 1.103855232100708, + "grad_norm": 0.25134217558928273, + "learning_rate": 3.939693813340966e-05, + "loss": 0.5249, + "step": 350 + }, + { + "epoch": 1.107002360346184, + "grad_norm": 0.29107674814907275, + "learning_rate": 3.939220722506361e-05, + "loss": 0.5397, + "step": 351 + }, + { + "epoch": 1.11014948859166, + "grad_norm": 0.22038795971485264, + "learning_rate": 3.938745815124574e-05, + "loss": 0.5178, + "step": 352 + }, + { + "epoch": 1.113296616837136, + "grad_norm": 0.20162768321484986, + "learning_rate": 3.938269091691626e-05, + "loss": 0.5424, + "step": 353 + }, + { + "epoch": 1.1164437450826121, + "grad_norm": 0.22798636250647, + "learning_rate": 3.937790552705437e-05, + "loss": 0.5401, + "step": 354 + }, + { + "epoch": 1.1195908733280882, + "grad_norm": 0.24904854499387244, + "learning_rate": 3.9373101986658204e-05, + "loss": 0.5405, + "step": 355 + }, + { + "epoch": 1.1227380015735642, + "grad_norm": 0.21942469794432654, + "learning_rate": 3.936828030074488e-05, + "loss": 0.5375, + "step": 356 + }, + { + "epoch": 1.1258851298190402, + "grad_norm": 0.2072161315607419, + "learning_rate": 3.936344047435046e-05, + "loss": 0.5324, + "step": 357 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 0.22290097704007697, + "learning_rate": 3.935858251252994e-05, + "loss": 0.5408, + "step": 358 + }, + { + "epoch": 1.132179386309992, + "grad_norm": 0.2194104787972607, + "learning_rate": 3.935370642035729e-05, + "loss": 0.5295, + "step": 359 + }, + { + "epoch": 1.135326514555468, + "grad_norm": 0.22998068727153642, + "learning_rate": 3.9348812202925375e-05, + "loss": 0.5299, + "step": 360 + }, + { + "epoch": 1.138473642800944, + "grad_norm": 0.20288897468564898, + "learning_rate": 3.9343899865346015e-05, + "loss": 0.5181, + "step": 361 + }, + { + "epoch": 1.1416207710464201, + "grad_norm": 0.2533470728298271, + "learning_rate": 3.933896941274996e-05, + "loss": 0.5403, + "step": 362 + }, + { + "epoch": 1.1447678992918962, + "grad_norm": 0.24735853692572296, + "learning_rate": 3.933402085028687e-05, + "loss": 0.5275, + "step": 363 + }, + { + "epoch": 1.1479150275373722, + "grad_norm": 0.21213304673435288, + "learning_rate": 3.932905418312531e-05, + "loss": 0.5299, + "step": 364 + }, + { + "epoch": 1.1510621557828482, + "grad_norm": 0.2570391773369148, + "learning_rate": 3.932406941645278e-05, + "loss": 0.5346, + "step": 365 + }, + { + "epoch": 1.1542092840283242, + "grad_norm": 0.20961822348437115, + "learning_rate": 3.931906655547568e-05, + "loss": 0.5329, + "step": 366 + }, + { + "epoch": 1.1573564122738, + "grad_norm": 0.19241742932101327, + "learning_rate": 3.9314045605419286e-05, + "loss": 0.5161, + "step": 367 + }, + { + "epoch": 1.160503540519276, + "grad_norm": 0.1981209025761205, + "learning_rate": 3.930900657152777e-05, + "loss": 0.5285, + "step": 368 + }, + { + "epoch": 1.163650668764752, + "grad_norm": 0.20579170791942883, + "learning_rate": 3.930394945906423e-05, + "loss": 0.5337, + "step": 369 + }, + { + "epoch": 1.1667977970102281, + "grad_norm": 0.23447783712452946, + "learning_rate": 3.929887427331061e-05, + "loss": 0.5319, + "step": 370 + }, + { + "epoch": 1.1699449252557041, + "grad_norm": 0.23240141396634467, + "learning_rate": 3.9293781019567736e-05, + "loss": 0.5216, + "step": 371 + }, + { + "epoch": 1.1730920535011802, + "grad_norm": 0.20717972878459204, + "learning_rate": 3.9288669703155305e-05, + "loss": 0.5285, + "step": 372 + }, + { + "epoch": 1.1762391817466562, + "grad_norm": 0.24921261028100655, + "learning_rate": 3.92835403294119e-05, + "loss": 0.5307, + "step": 373 + }, + { + "epoch": 1.1793863099921322, + "grad_norm": 0.2541092436236979, + "learning_rate": 3.927839290369494e-05, + "loss": 0.529, + "step": 374 + }, + { + "epoch": 1.1825334382376083, + "grad_norm": 0.20270164744890468, + "learning_rate": 3.927322743138071e-05, + "loss": 0.526, + "step": 375 + }, + { + "epoch": 1.185680566483084, + "grad_norm": 0.2399584297754137, + "learning_rate": 3.926804391786433e-05, + "loss": 0.5365, + "step": 376 + }, + { + "epoch": 1.1888276947285603, + "grad_norm": 0.24063071154269497, + "learning_rate": 3.926284236855979e-05, + "loss": 0.5359, + "step": 377 + }, + { + "epoch": 1.1919748229740361, + "grad_norm": 0.2147952601728759, + "learning_rate": 3.92576227888999e-05, + "loss": 0.5369, + "step": 378 + }, + { + "epoch": 1.1951219512195121, + "grad_norm": 0.23183738138121565, + "learning_rate": 3.925238518433629e-05, + "loss": 0.5406, + "step": 379 + }, + { + "epoch": 1.1982690794649882, + "grad_norm": 0.24958601341879394, + "learning_rate": 3.924712956033945e-05, + "loss": 0.5257, + "step": 380 + }, + { + "epoch": 1.2014162077104642, + "grad_norm": 0.2107193249163421, + "learning_rate": 3.9241855922398664e-05, + "loss": 0.5265, + "step": 381 + }, + { + "epoch": 1.2045633359559402, + "grad_norm": 0.21211894634268036, + "learning_rate": 3.923656427602203e-05, + "loss": 0.5209, + "step": 382 + }, + { + "epoch": 1.2077104642014163, + "grad_norm": 0.19007716139862285, + "learning_rate": 3.9231254626736475e-05, + "loss": 0.5248, + "step": 383 + }, + { + "epoch": 1.2108575924468923, + "grad_norm": 0.2106932996483277, + "learning_rate": 3.922592698008771e-05, + "loss": 0.5255, + "step": 384 + }, + { + "epoch": 1.2140047206923683, + "grad_norm": 0.20944881028089596, + "learning_rate": 3.922058134164025e-05, + "loss": 0.5383, + "step": 385 + }, + { + "epoch": 1.2171518489378443, + "grad_norm": 0.19091968823309646, + "learning_rate": 3.9215217716977405e-05, + "loss": 0.5321, + "step": 386 + }, + { + "epoch": 1.2202989771833201, + "grad_norm": 0.24949906631514807, + "learning_rate": 3.9209836111701274e-05, + "loss": 0.5337, + "step": 387 + }, + { + "epoch": 1.2234461054287962, + "grad_norm": 0.24811252472670564, + "learning_rate": 3.9204436531432725e-05, + "loss": 0.5305, + "step": 388 + }, + { + "epoch": 1.2265932336742722, + "grad_norm": 0.21473975761364125, + "learning_rate": 3.9199018981811405e-05, + "loss": 0.5203, + "step": 389 + }, + { + "epoch": 1.2297403619197482, + "grad_norm": 0.21533576750411015, + "learning_rate": 3.919358346849573e-05, + "loss": 0.5433, + "step": 390 + }, + { + "epoch": 1.2328874901652243, + "grad_norm": 0.21369216327652596, + "learning_rate": 3.918812999716288e-05, + "loss": 0.5305, + "step": 391 + }, + { + "epoch": 1.2360346184107003, + "grad_norm": 0.21462424108637662, + "learning_rate": 3.918265857350879e-05, + "loss": 0.5359, + "step": 392 + }, + { + "epoch": 1.2391817466561763, + "grad_norm": 0.24722325499788872, + "learning_rate": 3.917716920324815e-05, + "loss": 0.528, + "step": 393 + }, + { + "epoch": 1.2423288749016523, + "grad_norm": 0.2011641071003589, + "learning_rate": 3.917166189211438e-05, + "loss": 0.5314, + "step": 394 + }, + { + "epoch": 1.2454760031471284, + "grad_norm": 0.2129227867593859, + "learning_rate": 3.916613664585966e-05, + "loss": 0.536, + "step": 395 + }, + { + "epoch": 1.2486231313926042, + "grad_norm": 0.22745233172578577, + "learning_rate": 3.9160593470254884e-05, + "loss": 0.5313, + "step": 396 + }, + { + "epoch": 1.2517702596380802, + "grad_norm": 0.21406731944348806, + "learning_rate": 3.915503237108967e-05, + "loss": 0.5225, + "step": 397 + }, + { + "epoch": 1.2549173878835562, + "grad_norm": 0.23441772239476547, + "learning_rate": 3.9149453354172387e-05, + "loss": 0.5288, + "step": 398 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 0.23499009277537974, + "learning_rate": 3.914385642533008e-05, + "loss": 0.5334, + "step": 399 + }, + { + "epoch": 1.2612116443745083, + "grad_norm": 0.19728137418451414, + "learning_rate": 3.913824159040853e-05, + "loss": 0.529, + "step": 400 + }, + { + "epoch": 1.2643587726199843, + "grad_norm": 0.24085359726093897, + "learning_rate": 3.913260885527221e-05, + "loss": 0.5276, + "step": 401 + }, + { + "epoch": 1.2675059008654603, + "grad_norm": 0.25321890170169037, + "learning_rate": 3.912695822580428e-05, + "loss": 0.5317, + "step": 402 + }, + { + "epoch": 1.2706530291109361, + "grad_norm": 0.21705384607485445, + "learning_rate": 3.912128970790659e-05, + "loss": 0.5232, + "step": 403 + }, + { + "epoch": 1.2738001573564124, + "grad_norm": 0.20299880408423301, + "learning_rate": 3.911560330749971e-05, + "loss": 0.5366, + "step": 404 + }, + { + "epoch": 1.2769472856018882, + "grad_norm": 0.22689509092035035, + "learning_rate": 3.9109899030522846e-05, + "loss": 0.5304, + "step": 405 + }, + { + "epoch": 1.2800944138473642, + "grad_norm": 0.20734100246332385, + "learning_rate": 3.910417688293389e-05, + "loss": 0.5308, + "step": 406 + }, + { + "epoch": 1.2832415420928402, + "grad_norm": 0.2079086103791129, + "learning_rate": 3.909843687070939e-05, + "loss": 0.533, + "step": 407 + }, + { + "epoch": 1.2863886703383163, + "grad_norm": 0.2383801744359338, + "learning_rate": 3.9092678999844575e-05, + "loss": 0.5261, + "step": 408 + }, + { + "epoch": 1.2895357985837923, + "grad_norm": 0.26103176241188575, + "learning_rate": 3.90869032763533e-05, + "loss": 0.525, + "step": 409 + }, + { + "epoch": 1.2926829268292683, + "grad_norm": 0.23498553551570836, + "learning_rate": 3.90811097062681e-05, + "loss": 0.5257, + "step": 410 + }, + { + "epoch": 1.2958300550747444, + "grad_norm": 0.22833679759966044, + "learning_rate": 3.90752982956401e-05, + "loss": 0.5328, + "step": 411 + }, + { + "epoch": 1.2989771833202202, + "grad_norm": 0.25544426763785694, + "learning_rate": 3.906946905053912e-05, + "loss": 0.5238, + "step": 412 + }, + { + "epoch": 1.3021243115656964, + "grad_norm": 0.2672450987154701, + "learning_rate": 3.906362197705355e-05, + "loss": 0.5389, + "step": 413 + }, + { + "epoch": 1.3052714398111722, + "grad_norm": 0.2619780531405743, + "learning_rate": 3.905775708129045e-05, + "loss": 0.5289, + "step": 414 + }, + { + "epoch": 1.3084185680566482, + "grad_norm": 0.25026525055288934, + "learning_rate": 3.905187436937545e-05, + "loss": 0.5251, + "step": 415 + }, + { + "epoch": 1.3115656963021243, + "grad_norm": 0.2615761605038083, + "learning_rate": 3.904597384745282e-05, + "loss": 0.5232, + "step": 416 + }, + { + "epoch": 1.3147128245476003, + "grad_norm": 0.2209468332287666, + "learning_rate": 3.904005552168541e-05, + "loss": 0.5247, + "step": 417 + }, + { + "epoch": 1.3178599527930763, + "grad_norm": 0.27687648105272317, + "learning_rate": 3.9034119398254703e-05, + "loss": 0.5394, + "step": 418 + }, + { + "epoch": 1.3210070810385524, + "grad_norm": 0.264646195521817, + "learning_rate": 3.902816548336072e-05, + "loss": 0.5195, + "step": 419 + }, + { + "epoch": 1.3241542092840284, + "grad_norm": 0.24319496232874022, + "learning_rate": 3.90221937832221e-05, + "loss": 0.5297, + "step": 420 + }, + { + "epoch": 1.3273013375295044, + "grad_norm": 0.2682903371006935, + "learning_rate": 3.901620430407605e-05, + "loss": 0.5288, + "step": 421 + }, + { + "epoch": 1.3304484657749804, + "grad_norm": 0.25696926111301793, + "learning_rate": 3.9010197052178334e-05, + "loss": 0.5321, + "step": 422 + }, + { + "epoch": 1.3335955940204562, + "grad_norm": 0.21737804255664558, + "learning_rate": 3.9004172033803294e-05, + "loss": 0.527, + "step": 423 + }, + { + "epoch": 1.3367427222659323, + "grad_norm": 0.25367064453625227, + "learning_rate": 3.899812925524382e-05, + "loss": 0.5294, + "step": 424 + }, + { + "epoch": 1.3398898505114083, + "grad_norm": 0.23247403805521405, + "learning_rate": 3.8992068722811366e-05, + "loss": 0.5268, + "step": 425 + }, + { + "epoch": 1.3430369787568843, + "grad_norm": 0.20969217116963346, + "learning_rate": 3.89859904428359e-05, + "loss": 0.525, + "step": 426 + }, + { + "epoch": 1.3461841070023604, + "grad_norm": 0.21944665805836538, + "learning_rate": 3.897989442166597e-05, + "loss": 0.5303, + "step": 427 + }, + { + "epoch": 1.3493312352478364, + "grad_norm": 0.20387874014114413, + "learning_rate": 3.89737806656686e-05, + "loss": 0.5294, + "step": 428 + }, + { + "epoch": 1.3524783634933124, + "grad_norm": 0.23174913286284687, + "learning_rate": 3.8967649181229384e-05, + "loss": 0.5279, + "step": 429 + }, + { + "epoch": 1.3556254917387884, + "grad_norm": 0.2435719017940357, + "learning_rate": 3.896149997475241e-05, + "loss": 0.5246, + "step": 430 + }, + { + "epoch": 1.3587726199842645, + "grad_norm": 0.22427280573004413, + "learning_rate": 3.895533305266029e-05, + "loss": 0.5175, + "step": 431 + }, + { + "epoch": 1.3619197482297403, + "grad_norm": 0.23620945890539627, + "learning_rate": 3.894914842139411e-05, + "loss": 0.5263, + "step": 432 + }, + { + "epoch": 1.3650668764752163, + "grad_norm": 0.3013716827125622, + "learning_rate": 3.894294608741349e-05, + "loss": 0.5301, + "step": 433 + }, + { + "epoch": 1.3682140047206923, + "grad_norm": 0.2147253729034818, + "learning_rate": 3.893672605719651e-05, + "loss": 0.5324, + "step": 434 + }, + { + "epoch": 1.3713611329661684, + "grad_norm": 0.2818860617617866, + "learning_rate": 3.893048833723976e-05, + "loss": 0.5257, + "step": 435 + }, + { + "epoch": 1.3745082612116444, + "grad_norm": 0.29077510474579804, + "learning_rate": 3.892423293405828e-05, + "loss": 0.5329, + "step": 436 + }, + { + "epoch": 1.3776553894571204, + "grad_norm": 0.24939201266271596, + "learning_rate": 3.891795985418559e-05, + "loss": 0.5249, + "step": 437 + }, + { + "epoch": 1.3808025177025964, + "grad_norm": 0.2715447424635029, + "learning_rate": 3.891166910417368e-05, + "loss": 0.5176, + "step": 438 + }, + { + "epoch": 1.3839496459480725, + "grad_norm": 0.31564837515854355, + "learning_rate": 3.890536069059299e-05, + "loss": 0.5247, + "step": 439 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 0.28321629275081395, + "learning_rate": 3.88990346200324e-05, + "loss": 0.5292, + "step": 440 + }, + { + "epoch": 1.3902439024390243, + "grad_norm": 0.255553001892679, + "learning_rate": 3.889269089909924e-05, + "loss": 0.5274, + "step": 441 + }, + { + "epoch": 1.3933910306845003, + "grad_norm": 0.22898176345383003, + "learning_rate": 3.888632953441929e-05, + "loss": 0.5278, + "step": 442 + }, + { + "epoch": 1.3965381589299763, + "grad_norm": 0.25102215567626407, + "learning_rate": 3.887995053263673e-05, + "loss": 0.5263, + "step": 443 + }, + { + "epoch": 1.3996852871754524, + "grad_norm": 0.20971240214227582, + "learning_rate": 3.887355390041418e-05, + "loss": 0.5226, + "step": 444 + }, + { + "epoch": 1.4028324154209284, + "grad_norm": 0.20633909928885785, + "learning_rate": 3.886713964443266e-05, + "loss": 0.5289, + "step": 445 + }, + { + "epoch": 1.4059795436664044, + "grad_norm": 0.2574749675063496, + "learning_rate": 3.886070777139163e-05, + "loss": 0.5312, + "step": 446 + }, + { + "epoch": 1.4091266719118805, + "grad_norm": 0.261674083730878, + "learning_rate": 3.88542582880089e-05, + "loss": 0.536, + "step": 447 + }, + { + "epoch": 1.4122738001573565, + "grad_norm": 0.23684316877305508, + "learning_rate": 3.884779120102071e-05, + "loss": 0.5305, + "step": 448 + }, + { + "epoch": 1.4154209284028325, + "grad_norm": 0.2641611884928782, + "learning_rate": 3.884130651718168e-05, + "loss": 0.5242, + "step": 449 + }, + { + "epoch": 1.4185680566483083, + "grad_norm": 0.26330433265613506, + "learning_rate": 3.883480424326481e-05, + "loss": 0.5336, + "step": 450 + }, + { + "epoch": 1.4217151848937843, + "grad_norm": 0.24912300302600038, + "learning_rate": 3.882828438606145e-05, + "loss": 0.5349, + "step": 451 + }, + { + "epoch": 1.4248623131392604, + "grad_norm": 0.20605940059141603, + "learning_rate": 3.882174695238135e-05, + "loss": 0.5287, + "step": 452 + }, + { + "epoch": 1.4280094413847364, + "grad_norm": 0.2693403536580001, + "learning_rate": 3.8815191949052586e-05, + "loss": 0.5367, + "step": 453 + }, + { + "epoch": 1.4311565696302124, + "grad_norm": 0.314210855668854, + "learning_rate": 3.880861938292162e-05, + "loss": 0.523, + "step": 454 + }, + { + "epoch": 1.4343036978756885, + "grad_norm": 0.24090278288346206, + "learning_rate": 3.880202926085321e-05, + "loss": 0.5224, + "step": 455 + }, + { + "epoch": 1.4374508261211645, + "grad_norm": 0.30398502103831454, + "learning_rate": 3.87954215897305e-05, + "loss": 0.5302, + "step": 456 + }, + { + "epoch": 1.4405979543666405, + "grad_norm": 0.3216734104886104, + "learning_rate": 3.8788796376454936e-05, + "loss": 0.5274, + "step": 457 + }, + { + "epoch": 1.4437450826121165, + "grad_norm": 0.22333114204077223, + "learning_rate": 3.878215362794628e-05, + "loss": 0.5239, + "step": 458 + }, + { + "epoch": 1.4468922108575923, + "grad_norm": 0.23789190915199318, + "learning_rate": 3.877549335114263e-05, + "loss": 0.5209, + "step": 459 + }, + { + "epoch": 1.4500393391030684, + "grad_norm": 0.1890051988727759, + "learning_rate": 3.8768815553000376e-05, + "loss": 0.5269, + "step": 460 + }, + { + "epoch": 1.4531864673485444, + "grad_norm": 0.22963562375613875, + "learning_rate": 3.8762120240494223e-05, + "loss": 0.529, + "step": 461 + }, + { + "epoch": 1.4563335955940204, + "grad_norm": 0.24014125362476896, + "learning_rate": 3.875540742061715e-05, + "loss": 0.5249, + "step": 462 + }, + { + "epoch": 1.4594807238394965, + "grad_norm": 0.25487459533999646, + "learning_rate": 3.874867710038044e-05, + "loss": 0.5238, + "step": 463 + }, + { + "epoch": 1.4626278520849725, + "grad_norm": 0.2329939674520019, + "learning_rate": 3.874192928681364e-05, + "loss": 0.5245, + "step": 464 + }, + { + "epoch": 1.4657749803304485, + "grad_norm": 0.2375922581202063, + "learning_rate": 3.873516398696457e-05, + "loss": 0.5253, + "step": 465 + }, + { + "epoch": 1.4689221085759245, + "grad_norm": 0.22120136053113018, + "learning_rate": 3.8728381207899326e-05, + "loss": 0.5322, + "step": 466 + }, + { + "epoch": 1.4720692368214006, + "grad_norm": 0.2575168541935438, + "learning_rate": 3.872158095670225e-05, + "loss": 0.5163, + "step": 467 + }, + { + "epoch": 1.4752163650668764, + "grad_norm": 0.24759034856123785, + "learning_rate": 3.871476324047593e-05, + "loss": 0.5219, + "step": 468 + }, + { + "epoch": 1.4783634933123526, + "grad_norm": 0.2122545341024054, + "learning_rate": 3.870792806634121e-05, + "loss": 0.5219, + "step": 469 + }, + { + "epoch": 1.4815106215578284, + "grad_norm": 0.21548255412844264, + "learning_rate": 3.8701075441437156e-05, + "loss": 0.5139, + "step": 470 + }, + { + "epoch": 1.4846577498033044, + "grad_norm": 0.23209201852334332, + "learning_rate": 3.8694205372921054e-05, + "loss": 0.5255, + "step": 471 + }, + { + "epoch": 1.4878048780487805, + "grad_norm": 0.22357478851651288, + "learning_rate": 3.868731786796843e-05, + "loss": 0.5173, + "step": 472 + }, + { + "epoch": 1.4909520062942565, + "grad_norm": 0.23628716934333172, + "learning_rate": 3.8680412933773007e-05, + "loss": 0.5166, + "step": 473 + }, + { + "epoch": 1.4940991345397325, + "grad_norm": 0.2840109021374993, + "learning_rate": 3.867349057754671e-05, + "loss": 0.5248, + "step": 474 + }, + { + "epoch": 1.4972462627852086, + "grad_norm": 0.19780778754861236, + "learning_rate": 3.8666550806519676e-05, + "loss": 0.5309, + "step": 475 + }, + { + "epoch": 1.5003933910306846, + "grad_norm": 0.23377357292240997, + "learning_rate": 3.8659593627940204e-05, + "loss": 0.5242, + "step": 476 + }, + { + "epoch": 1.5035405192761604, + "grad_norm": 0.27389049583059905, + "learning_rate": 3.8652619049074814e-05, + "loss": 0.5326, + "step": 477 + }, + { + "epoch": 1.5066876475216366, + "grad_norm": 0.2155931645603816, + "learning_rate": 3.8645627077208166e-05, + "loss": 0.5195, + "step": 478 + }, + { + "epoch": 1.5098347757671124, + "grad_norm": 0.22156798104402176, + "learning_rate": 3.8638617719643095e-05, + "loss": 0.5171, + "step": 479 + }, + { + "epoch": 1.5129819040125885, + "grad_norm": 0.23067806483053538, + "learning_rate": 3.8631590983700606e-05, + "loss": 0.5152, + "step": 480 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 0.1921108745110929, + "learning_rate": 3.8624546876719834e-05, + "loss": 0.5283, + "step": 481 + }, + { + "epoch": 1.5192761605035405, + "grad_norm": 0.21072974454084317, + "learning_rate": 3.861748540605808e-05, + "loss": 0.5171, + "step": 482 + }, + { + "epoch": 1.5224232887490166, + "grad_norm": 0.21970389437346471, + "learning_rate": 3.8610406579090766e-05, + "loss": 0.5219, + "step": 483 + }, + { + "epoch": 1.5255704169944924, + "grad_norm": 0.20711315573580885, + "learning_rate": 3.860331040321145e-05, + "loss": 0.5253, + "step": 484 + }, + { + "epoch": 1.5287175452399686, + "grad_norm": 0.24879575847966118, + "learning_rate": 3.8596196885831804e-05, + "loss": 0.5302, + "step": 485 + }, + { + "epoch": 1.5318646734854444, + "grad_norm": 0.28066766221968237, + "learning_rate": 3.858906603438161e-05, + "loss": 0.5372, + "step": 486 + }, + { + "epoch": 1.5350118017309207, + "grad_norm": 0.23712162618272606, + "learning_rate": 3.8581917856308775e-05, + "loss": 0.53, + "step": 487 + }, + { + "epoch": 1.5381589299763965, + "grad_norm": 0.24813061042956056, + "learning_rate": 3.857475235907928e-05, + "loss": 0.5204, + "step": 488 + }, + { + "epoch": 1.5413060582218725, + "grad_norm": 0.20722206925566874, + "learning_rate": 3.8567569550177195e-05, + "loss": 0.5318, + "step": 489 + }, + { + "epoch": 1.5444531864673485, + "grad_norm": 0.2282801127025407, + "learning_rate": 3.856036943710469e-05, + "loss": 0.5238, + "step": 490 + }, + { + "epoch": 1.5476003147128246, + "grad_norm": 0.25422893533824964, + "learning_rate": 3.8553152027382e-05, + "loss": 0.5204, + "step": 491 + }, + { + "epoch": 1.5507474429583006, + "grad_norm": 0.2259564938535116, + "learning_rate": 3.854591732854741e-05, + "loss": 0.5257, + "step": 492 + }, + { + "epoch": 1.5538945712037766, + "grad_norm": 0.23136885066637908, + "learning_rate": 3.853866534815728e-05, + "loss": 0.5253, + "step": 493 + }, + { + "epoch": 1.5570416994492526, + "grad_norm": 0.25141412317565787, + "learning_rate": 3.853139609378603e-05, + "loss": 0.5215, + "step": 494 + }, + { + "epoch": 1.5601888276947284, + "grad_norm": 0.216386761256824, + "learning_rate": 3.85241095730261e-05, + "loss": 0.5175, + "step": 495 + }, + { + "epoch": 1.5633359559402047, + "grad_norm": 0.24355134099428055, + "learning_rate": 3.8516805793487974e-05, + "loss": 0.519, + "step": 496 + }, + { + "epoch": 1.5664830841856805, + "grad_norm": 0.19505711198842687, + "learning_rate": 3.850948476280015e-05, + "loss": 0.5327, + "step": 497 + }, + { + "epoch": 1.5696302124311565, + "grad_norm": 0.2447172400294907, + "learning_rate": 3.8502146488609164e-05, + "loss": 0.5212, + "step": 498 + }, + { + "epoch": 1.5727773406766326, + "grad_norm": 0.19962243706421512, + "learning_rate": 3.8494790978579565e-05, + "loss": 0.5142, + "step": 499 + }, + { + "epoch": 1.5759244689221086, + "grad_norm": 0.2841517257055549, + "learning_rate": 3.848741824039386e-05, + "loss": 0.5178, + "step": 500 + }, + { + "epoch": 1.5790715971675846, + "grad_norm": 0.20724334543587292, + "learning_rate": 3.8480028281752615e-05, + "loss": 0.5249, + "step": 501 + }, + { + "epoch": 1.5822187254130606, + "grad_norm": 0.28838849111673964, + "learning_rate": 3.8472621110374335e-05, + "loss": 0.5173, + "step": 502 + }, + { + "epoch": 1.5853658536585367, + "grad_norm": 0.24186826741838155, + "learning_rate": 3.8465196733995514e-05, + "loss": 0.5154, + "step": 503 + }, + { + "epoch": 1.5885129819040125, + "grad_norm": 0.1983223561298523, + "learning_rate": 3.8457755160370625e-05, + "loss": 0.509, + "step": 504 + }, + { + "epoch": 1.5916601101494887, + "grad_norm": 0.21649035102901398, + "learning_rate": 3.8450296397272095e-05, + "loss": 0.5321, + "step": 505 + }, + { + "epoch": 1.5948072383949645, + "grad_norm": 0.25388593478131405, + "learning_rate": 3.8442820452490305e-05, + "loss": 0.5249, + "step": 506 + }, + { + "epoch": 1.5979543666404405, + "grad_norm": 0.2642843241520677, + "learning_rate": 3.843532733383358e-05, + "loss": 0.5256, + "step": 507 + }, + { + "epoch": 1.6011014948859166, + "grad_norm": 0.23572275621033986, + "learning_rate": 3.8427817049128194e-05, + "loss": 0.5216, + "step": 508 + }, + { + "epoch": 1.6042486231313926, + "grad_norm": 0.2110591043789785, + "learning_rate": 3.842028960621834e-05, + "loss": 0.5149, + "step": 509 + }, + { + "epoch": 1.6073957513768686, + "grad_norm": 0.22877095520766147, + "learning_rate": 3.841274501296613e-05, + "loss": 0.5235, + "step": 510 + }, + { + "epoch": 1.6105428796223447, + "grad_norm": 0.24251420163715415, + "learning_rate": 3.84051832772516e-05, + "loss": 0.5144, + "step": 511 + }, + { + "epoch": 1.6136900078678207, + "grad_norm": 0.20511713060127806, + "learning_rate": 3.839760440697268e-05, + "loss": 0.5258, + "step": 512 + }, + { + "epoch": 1.6168371361132965, + "grad_norm": 0.23501746426992737, + "learning_rate": 3.83900084100452e-05, + "loss": 0.5218, + "step": 513 + }, + { + "epoch": 1.6199842643587727, + "grad_norm": 0.24300834825222642, + "learning_rate": 3.838239529440287e-05, + "loss": 0.5201, + "step": 514 + }, + { + "epoch": 1.6231313926042485, + "grad_norm": 0.2415867797392267, + "learning_rate": 3.83747650679973e-05, + "loss": 0.5214, + "step": 515 + }, + { + "epoch": 1.6262785208497246, + "grad_norm": 0.24575403339954768, + "learning_rate": 3.836711773879795e-05, + "loss": 0.5262, + "step": 516 + }, + { + "epoch": 1.6294256490952006, + "grad_norm": 0.19675787482506665, + "learning_rate": 3.835945331479216e-05, + "loss": 0.5144, + "step": 517 + }, + { + "epoch": 1.6325727773406766, + "grad_norm": 0.26577039634688837, + "learning_rate": 3.8351771803985115e-05, + "loss": 0.5192, + "step": 518 + }, + { + "epoch": 1.6357199055861527, + "grad_norm": 0.286665435100784, + "learning_rate": 3.8344073214399845e-05, + "loss": 0.5291, + "step": 519 + }, + { + "epoch": 1.6388670338316287, + "grad_norm": 0.18836938646912646, + "learning_rate": 3.833635755407723e-05, + "loss": 0.5109, + "step": 520 + }, + { + "epoch": 1.6420141620771047, + "grad_norm": 0.2650168026961069, + "learning_rate": 3.832862483107597e-05, + "loss": 0.5221, + "step": 521 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 0.2361325083482264, + "learning_rate": 3.832087505347257e-05, + "loss": 0.5219, + "step": 522 + }, + { + "epoch": 1.6483084185680568, + "grad_norm": 0.19034304753715936, + "learning_rate": 3.831310822936139e-05, + "loss": 0.5249, + "step": 523 + }, + { + "epoch": 1.6514555468135326, + "grad_norm": 0.22876829423081257, + "learning_rate": 3.830532436685457e-05, + "loss": 0.5144, + "step": 524 + }, + { + "epoch": 1.6546026750590088, + "grad_norm": 0.19602966120535223, + "learning_rate": 3.829752347408202e-05, + "loss": 0.5137, + "step": 525 + }, + { + "epoch": 1.6577498033044846, + "grad_norm": 0.19990915987444982, + "learning_rate": 3.8289705559191495e-05, + "loss": 0.5188, + "step": 526 + }, + { + "epoch": 1.6608969315499607, + "grad_norm": 0.22044066914833604, + "learning_rate": 3.8281870630348483e-05, + "loss": 0.5147, + "step": 527 + }, + { + "epoch": 1.6640440597954367, + "grad_norm": 0.21061663961165006, + "learning_rate": 3.827401869573626e-05, + "loss": 0.5231, + "step": 528 + }, + { + "epoch": 1.6671911880409127, + "grad_norm": 0.21712763474322638, + "learning_rate": 3.826614976355584e-05, + "loss": 0.5276, + "step": 529 + }, + { + "epoch": 1.6703383162863887, + "grad_norm": 0.23698263316551577, + "learning_rate": 3.825826384202604e-05, + "loss": 0.512, + "step": 530 + }, + { + "epoch": 1.6734854445318645, + "grad_norm": 0.26433812000215734, + "learning_rate": 3.8250360939383384e-05, + "loss": 0.5205, + "step": 531 + }, + { + "epoch": 1.6766325727773408, + "grad_norm": 0.23388260377969083, + "learning_rate": 3.8242441063882145e-05, + "loss": 0.5158, + "step": 532 + }, + { + "epoch": 1.6797797010228166, + "grad_norm": 0.23031333677705712, + "learning_rate": 3.82345042237943e-05, + "loss": 0.5211, + "step": 533 + }, + { + "epoch": 1.6829268292682928, + "grad_norm": 0.2548042458770785, + "learning_rate": 3.822655042740959e-05, + "loss": 0.5198, + "step": 534 + }, + { + "epoch": 1.6860739575137687, + "grad_norm": 0.22638295828893965, + "learning_rate": 3.8218579683035425e-05, + "loss": 0.5238, + "step": 535 + }, + { + "epoch": 1.6892210857592447, + "grad_norm": 0.2262970560082153, + "learning_rate": 3.8210591998996924e-05, + "loss": 0.5202, + "step": 536 + }, + { + "epoch": 1.6923682140047207, + "grad_norm": 0.21411782792791356, + "learning_rate": 3.8202587383636926e-05, + "loss": 0.5222, + "step": 537 + }, + { + "epoch": 1.6955153422501967, + "grad_norm": 0.20447045372047343, + "learning_rate": 3.8194565845315936e-05, + "loss": 0.5173, + "step": 538 + }, + { + "epoch": 1.6986624704956728, + "grad_norm": 0.2162189855266448, + "learning_rate": 3.818652739241211e-05, + "loss": 0.5144, + "step": 539 + }, + { + "epoch": 1.7018095987411486, + "grad_norm": 0.23817213025322223, + "learning_rate": 3.817847203332131e-05, + "loss": 0.5239, + "step": 540 + }, + { + "epoch": 1.7049567269866248, + "grad_norm": 0.2451599843139077, + "learning_rate": 3.8170399776457044e-05, + "loss": 0.5252, + "step": 541 + }, + { + "epoch": 1.7081038552321006, + "grad_norm": 0.21173486339403857, + "learning_rate": 3.816231063025045e-05, + "loss": 0.5144, + "step": 542 + }, + { + "epoch": 1.7112509834775769, + "grad_norm": 0.22798045746972564, + "learning_rate": 3.8154204603150334e-05, + "loss": 0.5246, + "step": 543 + }, + { + "epoch": 1.7143981117230527, + "grad_norm": 0.22874807203605335, + "learning_rate": 3.814608170362311e-05, + "loss": 0.5171, + "step": 544 + }, + { + "epoch": 1.7175452399685287, + "grad_norm": 0.20891847321188745, + "learning_rate": 3.8137941940152834e-05, + "loss": 0.5196, + "step": 545 + }, + { + "epoch": 1.7206923682140047, + "grad_norm": 0.21664552315581692, + "learning_rate": 3.812978532124116e-05, + "loss": 0.5074, + "step": 546 + }, + { + "epoch": 1.7238394964594808, + "grad_norm": 0.20341701815710325, + "learning_rate": 3.812161185540736e-05, + "loss": 0.5167, + "step": 547 + }, + { + "epoch": 1.7269866247049568, + "grad_norm": 0.21511450945343744, + "learning_rate": 3.811342155118829e-05, + "loss": 0.5192, + "step": 548 + }, + { + "epoch": 1.7301337529504326, + "grad_norm": 0.2194147912509461, + "learning_rate": 3.81052144171384e-05, + "loss": 0.5225, + "step": 549 + }, + { + "epoch": 1.7332808811959088, + "grad_norm": 0.22863514788872993, + "learning_rate": 3.809699046182972e-05, + "loss": 0.5081, + "step": 550 + }, + { + "epoch": 1.7364280094413846, + "grad_norm": 0.207709622035488, + "learning_rate": 3.808874969385184e-05, + "loss": 0.5089, + "step": 551 + }, + { + "epoch": 1.739575137686861, + "grad_norm": 0.21852716207508774, + "learning_rate": 3.808049212181192e-05, + "loss": 0.5198, + "step": 552 + }, + { + "epoch": 1.7427222659323367, + "grad_norm": 0.22298890603933133, + "learning_rate": 3.8072217754334655e-05, + "loss": 0.52, + "step": 553 + }, + { + "epoch": 1.7458693941778127, + "grad_norm": 0.2532878106195187, + "learning_rate": 3.8063926600062315e-05, + "loss": 0.5145, + "step": 554 + }, + { + "epoch": 1.7490165224232888, + "grad_norm": 0.24360819970385728, + "learning_rate": 3.805561866765467e-05, + "loss": 0.5141, + "step": 555 + }, + { + "epoch": 1.7521636506687648, + "grad_norm": 0.20861292145147736, + "learning_rate": 3.8047293965789025e-05, + "loss": 0.5196, + "step": 556 + }, + { + "epoch": 1.7553107789142408, + "grad_norm": 0.21285677902378522, + "learning_rate": 3.803895250316021e-05, + "loss": 0.5121, + "step": 557 + }, + { + "epoch": 1.7584579071597166, + "grad_norm": 0.2123709291037103, + "learning_rate": 3.803059428848054e-05, + "loss": 0.5176, + "step": 558 + }, + { + "epoch": 1.7616050354051929, + "grad_norm": 0.20536678541188122, + "learning_rate": 3.8022219330479854e-05, + "loss": 0.5209, + "step": 559 + }, + { + "epoch": 1.7647521636506687, + "grad_norm": 0.21563446035753572, + "learning_rate": 3.801382763790546e-05, + "loss": 0.5206, + "step": 560 + }, + { + "epoch": 1.767899291896145, + "grad_norm": 0.1986468939526788, + "learning_rate": 3.800541921952213e-05, + "loss": 0.5208, + "step": 561 + }, + { + "epoch": 1.7710464201416207, + "grad_norm": 0.1966987426971003, + "learning_rate": 3.799699408411215e-05, + "loss": 0.5073, + "step": 562 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 0.216109363155467, + "learning_rate": 3.7988552240475235e-05, + "loss": 0.5148, + "step": 563 + }, + { + "epoch": 1.7773406766325728, + "grad_norm": 0.1879949670400624, + "learning_rate": 3.7980093697428545e-05, + "loss": 0.5253, + "step": 564 + }, + { + "epoch": 1.7804878048780488, + "grad_norm": 0.21164171435059104, + "learning_rate": 3.797161846380669e-05, + "loss": 0.5131, + "step": 565 + }, + { + "epoch": 1.7836349331235248, + "grad_norm": 0.21236700030393402, + "learning_rate": 3.796312654846174e-05, + "loss": 0.5262, + "step": 566 + }, + { + "epoch": 1.7867820613690006, + "grad_norm": 0.23772908828608705, + "learning_rate": 3.795461796026314e-05, + "loss": 0.5161, + "step": 567 + }, + { + "epoch": 1.789929189614477, + "grad_norm": 0.1989690141679018, + "learning_rate": 3.794609270809779e-05, + "loss": 0.5148, + "step": 568 + }, + { + "epoch": 1.7930763178599527, + "grad_norm": 0.19386929212613396, + "learning_rate": 3.793755080086997e-05, + "loss": 0.5136, + "step": 569 + }, + { + "epoch": 1.796223446105429, + "grad_norm": 0.20873828711474318, + "learning_rate": 3.792899224750136e-05, + "loss": 0.5285, + "step": 570 + }, + { + "epoch": 1.7993705743509048, + "grad_norm": 0.21110002963411045, + "learning_rate": 3.7920417056931046e-05, + "loss": 0.5261, + "step": 571 + }, + { + "epoch": 1.8025177025963808, + "grad_norm": 0.1720427492785605, + "learning_rate": 3.791182523811545e-05, + "loss": 0.5144, + "step": 572 + }, + { + "epoch": 1.8056648308418568, + "grad_norm": 0.2174774642242585, + "learning_rate": 3.7903216800028416e-05, + "loss": 0.5106, + "step": 573 + }, + { + "epoch": 1.8088119590873328, + "grad_norm": 0.19514122415387664, + "learning_rate": 3.789459175166109e-05, + "loss": 0.5228, + "step": 574 + }, + { + "epoch": 1.8119590873328089, + "grad_norm": 0.18369904706820767, + "learning_rate": 3.7885950102022014e-05, + "loss": 0.5135, + "step": 575 + }, + { + "epoch": 1.8151062155782847, + "grad_norm": 0.20445427723961576, + "learning_rate": 3.787729186013704e-05, + "loss": 0.5148, + "step": 576 + }, + { + "epoch": 1.818253343823761, + "grad_norm": 0.1933783032598251, + "learning_rate": 3.786861703504936e-05, + "loss": 0.5215, + "step": 577 + }, + { + "epoch": 1.8214004720692367, + "grad_norm": 0.18342847430157735, + "learning_rate": 3.7859925635819476e-05, + "loss": 0.5128, + "step": 578 + }, + { + "epoch": 1.824547600314713, + "grad_norm": 0.2226340450308752, + "learning_rate": 3.785121767152523e-05, + "loss": 0.5283, + "step": 579 + }, + { + "epoch": 1.8276947285601888, + "grad_norm": 0.19369436637649629, + "learning_rate": 3.784249315126173e-05, + "loss": 0.5148, + "step": 580 + }, + { + "epoch": 1.8308418568056648, + "grad_norm": 0.19807163830925228, + "learning_rate": 3.783375208414139e-05, + "loss": 0.5151, + "step": 581 + }, + { + "epoch": 1.8339889850511408, + "grad_norm": 0.18328220410897705, + "learning_rate": 3.782499447929392e-05, + "loss": 0.514, + "step": 582 + }, + { + "epoch": 1.8371361132966169, + "grad_norm": 0.1839242669305662, + "learning_rate": 3.7816220345866294e-05, + "loss": 0.5251, + "step": 583 + }, + { + "epoch": 1.8402832415420929, + "grad_norm": 0.18986955880243364, + "learning_rate": 3.780742969302273e-05, + "loss": 0.5131, + "step": 584 + }, + { + "epoch": 1.8434303697875687, + "grad_norm": 0.19594785121809236, + "learning_rate": 3.7798622529944735e-05, + "loss": 0.5161, + "step": 585 + }, + { + "epoch": 1.846577498033045, + "grad_norm": 0.2357648214241525, + "learning_rate": 3.7789798865831024e-05, + "loss": 0.5156, + "step": 586 + }, + { + "epoch": 1.8497246262785207, + "grad_norm": 0.19304354227740758, + "learning_rate": 3.778095870989758e-05, + "loss": 0.5203, + "step": 587 + }, + { + "epoch": 1.852871754523997, + "grad_norm": 0.22728068274130966, + "learning_rate": 3.777210207137759e-05, + "loss": 0.5321, + "step": 588 + }, + { + "epoch": 1.8560188827694728, + "grad_norm": 0.2017775324752321, + "learning_rate": 3.7763228959521465e-05, + "loss": 0.5242, + "step": 589 + }, + { + "epoch": 1.8591660110149488, + "grad_norm": 0.22300635096362367, + "learning_rate": 3.775433938359681e-05, + "loss": 0.5231, + "step": 590 + }, + { + "epoch": 1.8623131392604249, + "grad_norm": 0.1996306525906858, + "learning_rate": 3.774543335288845e-05, + "loss": 0.5221, + "step": 591 + }, + { + "epoch": 1.8654602675059009, + "grad_norm": 0.16852847329609896, + "learning_rate": 3.773651087669837e-05, + "loss": 0.5107, + "step": 592 + }, + { + "epoch": 1.868607395751377, + "grad_norm": 0.24727169403577154, + "learning_rate": 3.7727571964345745e-05, + "loss": 0.522, + "step": 593 + }, + { + "epoch": 1.8717545239968527, + "grad_norm": 0.22616575051796836, + "learning_rate": 3.771861662516692e-05, + "loss": 0.5109, + "step": 594 + }, + { + "epoch": 1.874901652242329, + "grad_norm": 0.2500551252876427, + "learning_rate": 3.7709644868515386e-05, + "loss": 0.514, + "step": 595 + }, + { + "epoch": 1.8780487804878048, + "grad_norm": 0.19733979061423199, + "learning_rate": 3.770065670376179e-05, + "loss": 0.5138, + "step": 596 + }, + { + "epoch": 1.881195908733281, + "grad_norm": 0.23793266287535486, + "learning_rate": 3.769165214029392e-05, + "loss": 0.5151, + "step": 597 + }, + { + "epoch": 1.8843430369787568, + "grad_norm": 0.20047355737675948, + "learning_rate": 3.768263118751667e-05, + "loss": 0.5195, + "step": 598 + }, + { + "epoch": 1.8874901652242329, + "grad_norm": 0.1734693730368024, + "learning_rate": 3.767359385485208e-05, + "loss": 0.5067, + "step": 599 + }, + { + "epoch": 1.8906372934697089, + "grad_norm": 0.20206213476307608, + "learning_rate": 3.766454015173929e-05, + "loss": 0.5161, + "step": 600 + }, + { + "epoch": 1.893784421715185, + "grad_norm": 0.20882657364965593, + "learning_rate": 3.765547008763453e-05, + "loss": 0.5103, + "step": 601 + }, + { + "epoch": 1.896931549960661, + "grad_norm": 0.2021864503575807, + "learning_rate": 3.764638367201112e-05, + "loss": 0.5004, + "step": 602 + }, + { + "epoch": 1.9000786782061367, + "grad_norm": 0.20424529742189995, + "learning_rate": 3.763728091435946e-05, + "loss": 0.5162, + "step": 603 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 0.1961209704262886, + "learning_rate": 3.7628161824187025e-05, + "loss": 0.518, + "step": 604 + }, + { + "epoch": 1.9063729346970888, + "grad_norm": 0.23767968238521123, + "learning_rate": 3.7619026411018345e-05, + "loss": 0.5069, + "step": 605 + }, + { + "epoch": 1.909520062942565, + "grad_norm": 0.21212885274385532, + "learning_rate": 3.7609874684394994e-05, + "loss": 0.519, + "step": 606 + }, + { + "epoch": 1.9126671911880408, + "grad_norm": 0.21268544914645238, + "learning_rate": 3.760070665387558e-05, + "loss": 0.5136, + "step": 607 + }, + { + "epoch": 1.9158143194335169, + "grad_norm": 0.20821784947192484, + "learning_rate": 3.7591522329035763e-05, + "loss": 0.5159, + "step": 608 + }, + { + "epoch": 1.918961447678993, + "grad_norm": 0.22337933592922737, + "learning_rate": 3.75823217194682e-05, + "loss": 0.519, + "step": 609 + }, + { + "epoch": 1.922108575924469, + "grad_norm": 0.25135156804926945, + "learning_rate": 3.7573104834782566e-05, + "loss": 0.5153, + "step": 610 + }, + { + "epoch": 1.925255704169945, + "grad_norm": 0.197509832149269, + "learning_rate": 3.756387168460552e-05, + "loss": 0.5247, + "step": 611 + }, + { + "epoch": 1.9284028324154208, + "grad_norm": 0.2389600063017852, + "learning_rate": 3.7554622278580735e-05, + "loss": 0.5166, + "step": 612 + }, + { + "epoch": 1.931549960660897, + "grad_norm": 0.26288745403046027, + "learning_rate": 3.754535662636884e-05, + "loss": 0.5236, + "step": 613 + }, + { + "epoch": 1.9346970889063728, + "grad_norm": 0.20996861395700098, + "learning_rate": 3.7536074737647455e-05, + "loss": 0.5168, + "step": 614 + }, + { + "epoch": 1.937844217151849, + "grad_norm": 0.23502763223514755, + "learning_rate": 3.752677662211114e-05, + "loss": 0.5185, + "step": 615 + }, + { + "epoch": 1.9409913453973249, + "grad_norm": 0.22436762734979818, + "learning_rate": 3.75174622894714e-05, + "loss": 0.5207, + "step": 616 + }, + { + "epoch": 1.944138473642801, + "grad_norm": 0.1810432051540689, + "learning_rate": 3.7508131749456696e-05, + "loss": 0.5161, + "step": 617 + }, + { + "epoch": 1.947285601888277, + "grad_norm": 0.2522520252638888, + "learning_rate": 3.74987850118124e-05, + "loss": 0.5112, + "step": 618 + }, + { + "epoch": 1.950432730133753, + "grad_norm": 0.25396670669216415, + "learning_rate": 3.748942208630082e-05, + "loss": 0.5221, + "step": 619 + }, + { + "epoch": 1.953579858379229, + "grad_norm": 0.22197793438383823, + "learning_rate": 3.748004298270115e-05, + "loss": 0.5162, + "step": 620 + }, + { + "epoch": 1.956726986624705, + "grad_norm": 0.23177715709488106, + "learning_rate": 3.74706477108095e-05, + "loss": 0.5106, + "step": 621 + }, + { + "epoch": 1.959874114870181, + "grad_norm": 0.22079140874889208, + "learning_rate": 3.746123628043886e-05, + "loss": 0.5193, + "step": 622 + }, + { + "epoch": 1.9630212431156568, + "grad_norm": 0.22902705592617217, + "learning_rate": 3.745180870141908e-05, + "loss": 0.5147, + "step": 623 + }, + { + "epoch": 1.966168371361133, + "grad_norm": 0.21218179122207526, + "learning_rate": 3.744236498359692e-05, + "loss": 0.5139, + "step": 624 + }, + { + "epoch": 1.969315499606609, + "grad_norm": 0.2289304429026494, + "learning_rate": 3.743290513683595e-05, + "loss": 0.509, + "step": 625 + }, + { + "epoch": 1.972462627852085, + "grad_norm": 0.19761688027384242, + "learning_rate": 3.742342917101661e-05, + "loss": 0.5108, + "step": 626 + }, + { + "epoch": 1.975609756097561, + "grad_norm": 0.20658238399745513, + "learning_rate": 3.741393709603617e-05, + "loss": 0.5162, + "step": 627 + }, + { + "epoch": 1.978756884343037, + "grad_norm": 0.22387644395974296, + "learning_rate": 3.740442892180873e-05, + "loss": 0.5176, + "step": 628 + }, + { + "epoch": 1.981904012588513, + "grad_norm": 0.2099407145968934, + "learning_rate": 3.7394904658265205e-05, + "loss": 0.5193, + "step": 629 + }, + { + "epoch": 1.985051140833989, + "grad_norm": 0.20355440734215408, + "learning_rate": 3.7385364315353305e-05, + "loss": 0.502, + "step": 630 + }, + { + "epoch": 1.988198269079465, + "grad_norm": 0.2737712591861954, + "learning_rate": 3.7375807903037534e-05, + "loss": 0.5146, + "step": 631 + }, + { + "epoch": 1.9913453973249409, + "grad_norm": 0.3107234957383597, + "learning_rate": 3.73662354312992e-05, + "loss": 0.5181, + "step": 632 + }, + { + "epoch": 1.9944925255704171, + "grad_norm": 0.24548807968691783, + "learning_rate": 3.735664691013636e-05, + "loss": 0.5078, + "step": 633 + }, + { + "epoch": 1.997639653815893, + "grad_norm": 0.21987770768265905, + "learning_rate": 3.734704234956385e-05, + "loss": 0.5089, + "step": 634 + }, + { + "epoch": 2.003147128245476, + "grad_norm": 0.5383301018870941, + "learning_rate": 3.7337421759613255e-05, + "loss": 1.0306, + "step": 635 + }, + { + "epoch": 2.006294256490952, + "grad_norm": 0.7404093702367391, + "learning_rate": 3.7327785150332896e-05, + "loss": 0.4838, + "step": 636 + }, + { + "epoch": 2.009441384736428, + "grad_norm": 0.7862043686171012, + "learning_rate": 3.7318132531787835e-05, + "loss": 0.481, + "step": 637 + }, + { + "epoch": 2.012588512981904, + "grad_norm": 0.5852793874262968, + "learning_rate": 3.7308463914059846e-05, + "loss": 0.4815, + "step": 638 + }, + { + "epoch": 2.01573564122738, + "grad_norm": 0.3672440684726176, + "learning_rate": 3.729877930724741e-05, + "loss": 0.4752, + "step": 639 + }, + { + "epoch": 2.018882769472856, + "grad_norm": 0.5159055161972755, + "learning_rate": 3.7289078721465735e-05, + "loss": 0.4769, + "step": 640 + }, + { + "epoch": 2.022029897718332, + "grad_norm": 0.397198576601526, + "learning_rate": 3.7279362166846677e-05, + "loss": 0.4794, + "step": 641 + }, + { + "epoch": 2.025177025963808, + "grad_norm": 0.42002417967064054, + "learning_rate": 3.726962965353881e-05, + "loss": 0.4833, + "step": 642 + }, + { + "epoch": 2.028324154209284, + "grad_norm": 0.44343130968400324, + "learning_rate": 3.725988119170735e-05, + "loss": 0.4759, + "step": 643 + }, + { + "epoch": 2.03147128245476, + "grad_norm": 0.3602187948121981, + "learning_rate": 3.725011679153418e-05, + "loss": 0.47, + "step": 644 + }, + { + "epoch": 2.034618410700236, + "grad_norm": 0.3689059788914393, + "learning_rate": 3.7240336463217824e-05, + "loss": 0.4845, + "step": 645 + }, + { + "epoch": 2.037765538945712, + "grad_norm": 0.34291838849435596, + "learning_rate": 3.723054021697346e-05, + "loss": 0.4788, + "step": 646 + }, + { + "epoch": 2.040912667191188, + "grad_norm": 0.34322337603866276, + "learning_rate": 3.722072806303287e-05, + "loss": 0.4714, + "step": 647 + }, + { + "epoch": 2.044059795436664, + "grad_norm": 0.3243197383293679, + "learning_rate": 3.721090001164447e-05, + "loss": 0.4784, + "step": 648 + }, + { + "epoch": 2.04720692368214, + "grad_norm": 0.28998221744555436, + "learning_rate": 3.720105607307326e-05, + "loss": 0.4787, + "step": 649 + }, + { + "epoch": 2.050354051927616, + "grad_norm": 0.3306107669906591, + "learning_rate": 3.7191196257600845e-05, + "loss": 0.475, + "step": 650 + }, + { + "epoch": 2.0535011801730922, + "grad_norm": 0.25616746795125434, + "learning_rate": 3.718132057552542e-05, + "loss": 0.4727, + "step": 651 + }, + { + "epoch": 2.056648308418568, + "grad_norm": 0.3110708535061029, + "learning_rate": 3.7171429037161735e-05, + "loss": 0.4815, + "step": 652 + }, + { + "epoch": 2.059795436664044, + "grad_norm": 0.27174269235855014, + "learning_rate": 3.7161521652841114e-05, + "loss": 0.4792, + "step": 653 + }, + { + "epoch": 2.06294256490952, + "grad_norm": 0.2751966115922098, + "learning_rate": 3.715159843291143e-05, + "loss": 0.4737, + "step": 654 + }, + { + "epoch": 2.066089693154996, + "grad_norm": 0.2793498805071301, + "learning_rate": 3.714165938773709e-05, + "loss": 0.4797, + "step": 655 + }, + { + "epoch": 2.069236821400472, + "grad_norm": 0.22565153747735522, + "learning_rate": 3.713170452769903e-05, + "loss": 0.4734, + "step": 656 + }, + { + "epoch": 2.072383949645948, + "grad_norm": 0.2703231691240761, + "learning_rate": 3.712173386319472e-05, + "loss": 0.4798, + "step": 657 + }, + { + "epoch": 2.075531077891424, + "grad_norm": 0.20941889089054108, + "learning_rate": 3.711174740463811e-05, + "loss": 0.4767, + "step": 658 + }, + { + "epoch": 2.0786782061369, + "grad_norm": 0.24127646716392062, + "learning_rate": 3.710174516245967e-05, + "loss": 0.4752, + "step": 659 + }, + { + "epoch": 2.0818253343823763, + "grad_norm": 0.22857637349920826, + "learning_rate": 3.7091727147106336e-05, + "loss": 0.4816, + "step": 660 + }, + { + "epoch": 2.084972462627852, + "grad_norm": 0.2403738146911501, + "learning_rate": 3.7081693369041544e-05, + "loss": 0.4802, + "step": 661 + }, + { + "epoch": 2.088119590873328, + "grad_norm": 0.2239762593172141, + "learning_rate": 3.707164383874516e-05, + "loss": 0.4729, + "step": 662 + }, + { + "epoch": 2.091266719118804, + "grad_norm": 0.2532472006395691, + "learning_rate": 3.706157856671353e-05, + "loss": 0.4775, + "step": 663 + }, + { + "epoch": 2.09441384736428, + "grad_norm": 0.2368186581534253, + "learning_rate": 3.7051497563459436e-05, + "loss": 0.4747, + "step": 664 + }, + { + "epoch": 2.097560975609756, + "grad_norm": 0.21848874983233801, + "learning_rate": 3.704140083951208e-05, + "loss": 0.4744, + "step": 665 + }, + { + "epoch": 2.100708103855232, + "grad_norm": 0.2589340916721653, + "learning_rate": 3.703128840541709e-05, + "loss": 0.4686, + "step": 666 + }, + { + "epoch": 2.1038552321007082, + "grad_norm": 0.1951064514885459, + "learning_rate": 3.7021160271736505e-05, + "loss": 0.4716, + "step": 667 + }, + { + "epoch": 2.107002360346184, + "grad_norm": 0.27598581262671057, + "learning_rate": 3.701101644904876e-05, + "loss": 0.474, + "step": 668 + }, + { + "epoch": 2.1101494885916603, + "grad_norm": 0.2065313958477124, + "learning_rate": 3.7000856947948676e-05, + "loss": 0.4715, + "step": 669 + }, + { + "epoch": 2.113296616837136, + "grad_norm": 0.23178764564943305, + "learning_rate": 3.699068177904745e-05, + "loss": 0.4806, + "step": 670 + }, + { + "epoch": 2.116443745082612, + "grad_norm": 0.20461181525036945, + "learning_rate": 3.698049095297265e-05, + "loss": 0.4748, + "step": 671 + }, + { + "epoch": 2.119590873328088, + "grad_norm": 0.2350705336847509, + "learning_rate": 3.697028448036817e-05, + "loss": 0.4729, + "step": 672 + }, + { + "epoch": 2.122738001573564, + "grad_norm": 0.20120939367328008, + "learning_rate": 3.696006237189429e-05, + "loss": 0.4786, + "step": 673 + }, + { + "epoch": 2.12588512981904, + "grad_norm": 0.22060728235722682, + "learning_rate": 3.6949824638227585e-05, + "loss": 0.4774, + "step": 674 + }, + { + "epoch": 2.129032258064516, + "grad_norm": 0.19177609563558462, + "learning_rate": 3.693957129006096e-05, + "loss": 0.484, + "step": 675 + }, + { + "epoch": 2.1321793863099923, + "grad_norm": 0.25855343543212544, + "learning_rate": 3.692930233810364e-05, + "loss": 0.4837, + "step": 676 + }, + { + "epoch": 2.135326514555468, + "grad_norm": 0.22818364530705276, + "learning_rate": 3.691901779308113e-05, + "loss": 0.4774, + "step": 677 + }, + { + "epoch": 2.1384736428009443, + "grad_norm": 0.20914360052852593, + "learning_rate": 3.690871766573523e-05, + "loss": 0.4728, + "step": 678 + }, + { + "epoch": 2.14162077104642, + "grad_norm": 0.25587656475568926, + "learning_rate": 3.6898401966824035e-05, + "loss": 0.4698, + "step": 679 + }, + { + "epoch": 2.144767899291896, + "grad_norm": 0.1826067226207546, + "learning_rate": 3.688807070712186e-05, + "loss": 0.4761, + "step": 680 + }, + { + "epoch": 2.147915027537372, + "grad_norm": 0.22738148878540357, + "learning_rate": 3.68777238974193e-05, + "loss": 0.4714, + "step": 681 + }, + { + "epoch": 2.151062155782848, + "grad_norm": 0.2111664202105763, + "learning_rate": 3.68673615485232e-05, + "loss": 0.4774, + "step": 682 + }, + { + "epoch": 2.1542092840283242, + "grad_norm": 0.24863885294296703, + "learning_rate": 3.685698367125662e-05, + "loss": 0.4743, + "step": 683 + }, + { + "epoch": 2.1573564122738, + "grad_norm": 0.21812972928565583, + "learning_rate": 3.684659027645884e-05, + "loss": 0.469, + "step": 684 + }, + { + "epoch": 2.1605035405192763, + "grad_norm": 0.24327955100377346, + "learning_rate": 3.683618137498535e-05, + "loss": 0.4781, + "step": 685 + }, + { + "epoch": 2.163650668764752, + "grad_norm": 0.2120560203500377, + "learning_rate": 3.6825756977707826e-05, + "loss": 0.4718, + "step": 686 + }, + { + "epoch": 2.1667977970102283, + "grad_norm": 0.24161382970696385, + "learning_rate": 3.6815317095514145e-05, + "loss": 0.4767, + "step": 687 + }, + { + "epoch": 2.169944925255704, + "grad_norm": 0.20329140472495538, + "learning_rate": 3.680486173930835e-05, + "loss": 0.4827, + "step": 688 + }, + { + "epoch": 2.17309205350118, + "grad_norm": 0.24761268049420182, + "learning_rate": 3.679439092001065e-05, + "loss": 0.4608, + "step": 689 + }, + { + "epoch": 2.176239181746656, + "grad_norm": 0.1871251156119415, + "learning_rate": 3.6783904648557396e-05, + "loss": 0.4695, + "step": 690 + }, + { + "epoch": 2.179386309992132, + "grad_norm": 0.2205720555772292, + "learning_rate": 3.67734029359011e-05, + "loss": 0.4717, + "step": 691 + }, + { + "epoch": 2.1825334382376083, + "grad_norm": 0.21386365254718184, + "learning_rate": 3.676288579301036e-05, + "loss": 0.4764, + "step": 692 + }, + { + "epoch": 2.185680566483084, + "grad_norm": 0.18868765682443855, + "learning_rate": 3.6752353230869925e-05, + "loss": 0.4698, + "step": 693 + }, + { + "epoch": 2.1888276947285603, + "grad_norm": 0.22008810612187404, + "learning_rate": 3.6741805260480644e-05, + "loss": 0.4713, + "step": 694 + }, + { + "epoch": 2.191974822974036, + "grad_norm": 0.20786113581001195, + "learning_rate": 3.673124189285945e-05, + "loss": 0.4806, + "step": 695 + }, + { + "epoch": 2.1951219512195124, + "grad_norm": 0.21496969539942087, + "learning_rate": 3.672066313903937e-05, + "loss": 0.4713, + "step": 696 + }, + { + "epoch": 2.198269079464988, + "grad_norm": 0.19763508819414857, + "learning_rate": 3.671006901006948e-05, + "loss": 0.4736, + "step": 697 + }, + { + "epoch": 2.201416207710464, + "grad_norm": 0.2293997083330246, + "learning_rate": 3.669945951701494e-05, + "loss": 0.4764, + "step": 698 + }, + { + "epoch": 2.2045633359559402, + "grad_norm": 0.22585825760183822, + "learning_rate": 3.668883467095694e-05, + "loss": 0.4734, + "step": 699 + }, + { + "epoch": 2.207710464201416, + "grad_norm": 0.20186256772917444, + "learning_rate": 3.6678194482992716e-05, + "loss": 0.4777, + "step": 700 + }, + { + "epoch": 2.2108575924468923, + "grad_norm": 0.19540930931295644, + "learning_rate": 3.666753896423551e-05, + "loss": 0.4846, + "step": 701 + }, + { + "epoch": 2.214004720692368, + "grad_norm": 0.21556486232945823, + "learning_rate": 3.6656868125814605e-05, + "loss": 0.4797, + "step": 702 + }, + { + "epoch": 2.2171518489378443, + "grad_norm": 0.19136441764044193, + "learning_rate": 3.664618197887526e-05, + "loss": 0.4722, + "step": 703 + }, + { + "epoch": 2.22029897718332, + "grad_norm": 0.24889924698530747, + "learning_rate": 3.663548053457873e-05, + "loss": 0.4824, + "step": 704 + }, + { + "epoch": 2.2234461054287964, + "grad_norm": 0.20658957990757543, + "learning_rate": 3.662476380410227e-05, + "loss": 0.4728, + "step": 705 + }, + { + "epoch": 2.226593233674272, + "grad_norm": 0.19277909826894754, + "learning_rate": 3.661403179863905e-05, + "loss": 0.4724, + "step": 706 + }, + { + "epoch": 2.229740361919748, + "grad_norm": 0.21254918935528136, + "learning_rate": 3.660328452939825e-05, + "loss": 0.4762, + "step": 707 + }, + { + "epoch": 2.2328874901652243, + "grad_norm": 0.2037212638248634, + "learning_rate": 3.659252200760495e-05, + "loss": 0.4609, + "step": 708 + }, + { + "epoch": 2.2360346184107, + "grad_norm": 0.17871023945609968, + "learning_rate": 3.658174424450019e-05, + "loss": 0.4748, + "step": 709 + }, + { + "epoch": 2.2391817466561763, + "grad_norm": 0.21443695153487202, + "learning_rate": 3.657095125134091e-05, + "loss": 0.4753, + "step": 710 + }, + { + "epoch": 2.242328874901652, + "grad_norm": 0.19069156080979188, + "learning_rate": 3.656014303939996e-05, + "loss": 0.4717, + "step": 711 + }, + { + "epoch": 2.2454760031471284, + "grad_norm": 0.2069028744399135, + "learning_rate": 3.654931961996611e-05, + "loss": 0.4783, + "step": 712 + }, + { + "epoch": 2.248623131392604, + "grad_norm": 0.18637707862361488, + "learning_rate": 3.653848100434397e-05, + "loss": 0.4832, + "step": 713 + }, + { + "epoch": 2.2517702596380804, + "grad_norm": 0.21210206014309427, + "learning_rate": 3.652762720385406e-05, + "loss": 0.4826, + "step": 714 + }, + { + "epoch": 2.2549173878835562, + "grad_norm": 0.18374915556791416, + "learning_rate": 3.651675822983273e-05, + "loss": 0.4728, + "step": 715 + }, + { + "epoch": 2.258064516129032, + "grad_norm": 0.21024735238705872, + "learning_rate": 3.65058740936322e-05, + "loss": 0.4706, + "step": 716 + }, + { + "epoch": 2.2612116443745083, + "grad_norm": 0.18392863904521575, + "learning_rate": 3.649497480662053e-05, + "loss": 0.4795, + "step": 717 + }, + { + "epoch": 2.264358772619984, + "grad_norm": 0.19781125359127497, + "learning_rate": 3.648406038018158e-05, + "loss": 0.4774, + "step": 718 + }, + { + "epoch": 2.2675059008654603, + "grad_norm": 0.21244857359184766, + "learning_rate": 3.6473130825715036e-05, + "loss": 0.4778, + "step": 719 + }, + { + "epoch": 2.270653029110936, + "grad_norm": 0.2240315261327545, + "learning_rate": 3.64621861546364e-05, + "loss": 0.4768, + "step": 720 + }, + { + "epoch": 2.2738001573564124, + "grad_norm": 0.19195015387576453, + "learning_rate": 3.645122637837693e-05, + "loss": 0.4761, + "step": 721 + }, + { + "epoch": 2.276947285601888, + "grad_norm": 0.23948727145219656, + "learning_rate": 3.644025150838368e-05, + "loss": 0.4843, + "step": 722 + }, + { + "epoch": 2.2800944138473644, + "grad_norm": 0.2044549094308154, + "learning_rate": 3.642926155611949e-05, + "loss": 0.4799, + "step": 723 + }, + { + "epoch": 2.2832415420928402, + "grad_norm": 0.1912737434372698, + "learning_rate": 3.64182565330629e-05, + "loss": 0.477, + "step": 724 + }, + { + "epoch": 2.286388670338316, + "grad_norm": 0.2225878785242701, + "learning_rate": 3.6407236450708235e-05, + "loss": 0.4659, + "step": 725 + }, + { + "epoch": 2.2895357985837923, + "grad_norm": 0.18432866393581965, + "learning_rate": 3.639620132056553e-05, + "loss": 0.4817, + "step": 726 + }, + { + "epoch": 2.292682926829268, + "grad_norm": 0.19897028209983136, + "learning_rate": 3.638515115416055e-05, + "loss": 0.4833, + "step": 727 + }, + { + "epoch": 2.2958300550747444, + "grad_norm": 0.20081135317562743, + "learning_rate": 3.637408596303476e-05, + "loss": 0.4704, + "step": 728 + }, + { + "epoch": 2.29897718332022, + "grad_norm": 0.1910907865185398, + "learning_rate": 3.63630057587453e-05, + "loss": 0.4825, + "step": 729 + }, + { + "epoch": 2.3021243115656964, + "grad_norm": 0.2079462815404506, + "learning_rate": 3.6351910552865e-05, + "loss": 0.4757, + "step": 730 + }, + { + "epoch": 2.305271439811172, + "grad_norm": 0.18921524052483815, + "learning_rate": 3.634080035698238e-05, + "loss": 0.4828, + "step": 731 + }, + { + "epoch": 2.3084185680566485, + "grad_norm": 0.19748083973385058, + "learning_rate": 3.632967518270159e-05, + "loss": 0.4747, + "step": 732 + }, + { + "epoch": 2.3115656963021243, + "grad_norm": 0.18371527979865251, + "learning_rate": 3.6318535041642434e-05, + "loss": 0.4787, + "step": 733 + }, + { + "epoch": 2.3147128245476, + "grad_norm": 0.1802590221169126, + "learning_rate": 3.630737994544036e-05, + "loss": 0.4771, + "step": 734 + }, + { + "epoch": 2.3178599527930763, + "grad_norm": 0.178911843071879, + "learning_rate": 3.6296209905746416e-05, + "loss": 0.4691, + "step": 735 + }, + { + "epoch": 2.321007081038552, + "grad_norm": 0.20053075969824302, + "learning_rate": 3.628502493422726e-05, + "loss": 0.4779, + "step": 736 + }, + { + "epoch": 2.3241542092840284, + "grad_norm": 0.17770666358757115, + "learning_rate": 3.627382504256516e-05, + "loss": 0.4771, + "step": 737 + }, + { + "epoch": 2.327301337529504, + "grad_norm": 0.19730337436045323, + "learning_rate": 3.626261024245795e-05, + "loss": 0.4707, + "step": 738 + }, + { + "epoch": 2.3304484657749804, + "grad_norm": 0.19412352087085247, + "learning_rate": 3.625138054561906e-05, + "loss": 0.4781, + "step": 739 + }, + { + "epoch": 2.3335955940204562, + "grad_norm": 0.17759746668876183, + "learning_rate": 3.6240135963777446e-05, + "loss": 0.4705, + "step": 740 + }, + { + "epoch": 2.3367427222659325, + "grad_norm": 0.2043623039312422, + "learning_rate": 3.622887650867765e-05, + "loss": 0.4684, + "step": 741 + }, + { + "epoch": 2.3398898505114083, + "grad_norm": 0.20845845270662547, + "learning_rate": 3.6217602192079706e-05, + "loss": 0.477, + "step": 742 + }, + { + "epoch": 2.343036978756884, + "grad_norm": 0.18687350571910952, + "learning_rate": 3.620631302575921e-05, + "loss": 0.4768, + "step": 743 + }, + { + "epoch": 2.3461841070023604, + "grad_norm": 0.21502507739077148, + "learning_rate": 3.619500902150723e-05, + "loss": 0.4772, + "step": 744 + }, + { + "epoch": 2.349331235247836, + "grad_norm": 0.18886284710156062, + "learning_rate": 3.6183690191130365e-05, + "loss": 0.4812, + "step": 745 + }, + { + "epoch": 2.3524783634933124, + "grad_norm": 0.17965446422580023, + "learning_rate": 3.617235654645068e-05, + "loss": 0.4774, + "step": 746 + }, + { + "epoch": 2.355625491738788, + "grad_norm": 0.20326170603684612, + "learning_rate": 3.616100809930572e-05, + "loss": 0.4768, + "step": 747 + }, + { + "epoch": 2.3587726199842645, + "grad_norm": 0.1882843118623647, + "learning_rate": 3.614964486154848e-05, + "loss": 0.4722, + "step": 748 + }, + { + "epoch": 2.3619197482297403, + "grad_norm": 0.17204335083123673, + "learning_rate": 3.613826684504743e-05, + "loss": 0.4674, + "step": 749 + }, + { + "epoch": 2.3650668764752165, + "grad_norm": 0.19139874893907688, + "learning_rate": 3.612687406168644e-05, + "loss": 0.4681, + "step": 750 + }, + { + "epoch": 2.3682140047206923, + "grad_norm": 0.1742655605287443, + "learning_rate": 3.611546652336482e-05, + "loss": 0.4735, + "step": 751 + }, + { + "epoch": 2.371361132966168, + "grad_norm": 0.1785986233463769, + "learning_rate": 3.610404424199732e-05, + "loss": 0.4725, + "step": 752 + }, + { + "epoch": 2.3745082612116444, + "grad_norm": 0.1770999597168329, + "learning_rate": 3.6092607229514026e-05, + "loss": 0.4751, + "step": 753 + }, + { + "epoch": 2.3776553894571206, + "grad_norm": 0.1861951910359395, + "learning_rate": 3.608115549786047e-05, + "loss": 0.4772, + "step": 754 + }, + { + "epoch": 2.3808025177025964, + "grad_norm": 0.1789049105676948, + "learning_rate": 3.6069689058997506e-05, + "loss": 0.4717, + "step": 755 + }, + { + "epoch": 2.3839496459480722, + "grad_norm": 0.18717193575284816, + "learning_rate": 3.60582079249014e-05, + "loss": 0.4742, + "step": 756 + }, + { + "epoch": 2.3870967741935485, + "grad_norm": 0.2319346851175833, + "learning_rate": 3.604671210756373e-05, + "loss": 0.48, + "step": 757 + }, + { + "epoch": 2.3902439024390243, + "grad_norm": 0.20723294874930143, + "learning_rate": 3.603520161899144e-05, + "loss": 0.4728, + "step": 758 + }, + { + "epoch": 2.3933910306845005, + "grad_norm": 0.21571506744754684, + "learning_rate": 3.6023676471206746e-05, + "loss": 0.4695, + "step": 759 + }, + { + "epoch": 2.3965381589299763, + "grad_norm": 0.17314280493311868, + "learning_rate": 3.601213667624724e-05, + "loss": 0.4735, + "step": 760 + }, + { + "epoch": 2.399685287175452, + "grad_norm": 0.21517245659461837, + "learning_rate": 3.600058224616576e-05, + "loss": 0.4805, + "step": 761 + }, + { + "epoch": 2.4028324154209284, + "grad_norm": 0.20430687151416146, + "learning_rate": 3.598901319303047e-05, + "loss": 0.4843, + "step": 762 + }, + { + "epoch": 2.4059795436664047, + "grad_norm": 0.2004177768069127, + "learning_rate": 3.597742952892477e-05, + "loss": 0.4833, + "step": 763 + }, + { + "epoch": 2.4091266719118805, + "grad_norm": 0.24567032007723139, + "learning_rate": 3.5965831265947344e-05, + "loss": 0.4686, + "step": 764 + }, + { + "epoch": 2.4122738001573563, + "grad_norm": 0.21956644771343653, + "learning_rate": 3.595421841621212e-05, + "loss": 0.478, + "step": 765 + }, + { + "epoch": 2.4154209284028325, + "grad_norm": 0.2038846900874555, + "learning_rate": 3.594259099184826e-05, + "loss": 0.4739, + "step": 766 + }, + { + "epoch": 2.4185680566483083, + "grad_norm": 0.21879240881473924, + "learning_rate": 3.593094900500015e-05, + "loss": 0.4713, + "step": 767 + }, + { + "epoch": 2.4217151848937846, + "grad_norm": 0.22973634489226144, + "learning_rate": 3.591929246782738e-05, + "loss": 0.4848, + "step": 768 + }, + { + "epoch": 2.4248623131392604, + "grad_norm": 0.19432937590163568, + "learning_rate": 3.5907621392504747e-05, + "loss": 0.4791, + "step": 769 + }, + { + "epoch": 2.4280094413847366, + "grad_norm": 0.19157056326864344, + "learning_rate": 3.589593579122222e-05, + "loss": 0.4801, + "step": 770 + }, + { + "epoch": 2.4311565696302124, + "grad_norm": 0.19492660523958835, + "learning_rate": 3.588423567618496e-05, + "loss": 0.4739, + "step": 771 + }, + { + "epoch": 2.4343036978756887, + "grad_norm": 0.19603103831816215, + "learning_rate": 3.5872521059613254e-05, + "loss": 0.4783, + "step": 772 + }, + { + "epoch": 2.4374508261211645, + "grad_norm": 0.16793141618091936, + "learning_rate": 3.5860791953742574e-05, + "loss": 0.4828, + "step": 773 + }, + { + "epoch": 2.4405979543666403, + "grad_norm": 0.18926327402558274, + "learning_rate": 3.5849048370823496e-05, + "loss": 0.462, + "step": 774 + }, + { + "epoch": 2.4437450826121165, + "grad_norm": 0.20901177449925584, + "learning_rate": 3.583729032312173e-05, + "loss": 0.4704, + "step": 775 + }, + { + "epoch": 2.4468922108575923, + "grad_norm": 0.21820920525411092, + "learning_rate": 3.582551782291809e-05, + "loss": 0.4661, + "step": 776 + }, + { + "epoch": 2.4500393391030686, + "grad_norm": 0.18520013651848868, + "learning_rate": 3.581373088250849e-05, + "loss": 0.4755, + "step": 777 + }, + { + "epoch": 2.4531864673485444, + "grad_norm": 0.2426404576575414, + "learning_rate": 3.580192951420391e-05, + "loss": 0.4723, + "step": 778 + }, + { + "epoch": 2.4563335955940206, + "grad_norm": 0.23249501540895848, + "learning_rate": 3.579011373033044e-05, + "loss": 0.4755, + "step": 779 + }, + { + "epoch": 2.4594807238394965, + "grad_norm": 0.2161001745094506, + "learning_rate": 3.577828354322917e-05, + "loss": 0.4773, + "step": 780 + }, + { + "epoch": 2.4626278520849727, + "grad_norm": 0.21146417784405747, + "learning_rate": 3.576643896525628e-05, + "loss": 0.4871, + "step": 781 + }, + { + "epoch": 2.4657749803304485, + "grad_norm": 0.20723676007259584, + "learning_rate": 3.575458000878294e-05, + "loss": 0.4783, + "step": 782 + }, + { + "epoch": 2.4689221085759243, + "grad_norm": 0.2584014129384574, + "learning_rate": 3.5742706686195386e-05, + "loss": 0.4767, + "step": 783 + }, + { + "epoch": 2.4720692368214006, + "grad_norm": 0.19393702218745548, + "learning_rate": 3.573081900989482e-05, + "loss": 0.4804, + "step": 784 + }, + { + "epoch": 2.4752163650668764, + "grad_norm": 0.2246269196846869, + "learning_rate": 3.5718916992297456e-05, + "loss": 0.4748, + "step": 785 + }, + { + "epoch": 2.4783634933123526, + "grad_norm": 0.2016712184696746, + "learning_rate": 3.5707000645834476e-05, + "loss": 0.4839, + "step": 786 + }, + { + "epoch": 2.4815106215578284, + "grad_norm": 0.20378549394086407, + "learning_rate": 3.569506998295203e-05, + "loss": 0.4726, + "step": 787 + }, + { + "epoch": 2.4846577498033047, + "grad_norm": 0.2337914377259457, + "learning_rate": 3.568312501611123e-05, + "loss": 0.4814, + "step": 788 + }, + { + "epoch": 2.4878048780487805, + "grad_norm": 0.19396608409990398, + "learning_rate": 3.5671165757788115e-05, + "loss": 0.4761, + "step": 789 + }, + { + "epoch": 2.4909520062942567, + "grad_norm": 0.2692755192747761, + "learning_rate": 3.5659192220473654e-05, + "loss": 0.4785, + "step": 790 + }, + { + "epoch": 2.4940991345397325, + "grad_norm": 0.18587733519714691, + "learning_rate": 3.5647204416673746e-05, + "loss": 0.4864, + "step": 791 + }, + { + "epoch": 2.4972462627852083, + "grad_norm": 0.2449196618260009, + "learning_rate": 3.5635202358909164e-05, + "loss": 0.4763, + "step": 792 + }, + { + "epoch": 2.5003933910306846, + "grad_norm": 0.22615305309932182, + "learning_rate": 3.562318605971559e-05, + "loss": 0.4851, + "step": 793 + }, + { + "epoch": 2.5035405192761604, + "grad_norm": 0.2043080610888049, + "learning_rate": 3.561115553164356e-05, + "loss": 0.4726, + "step": 794 + }, + { + "epoch": 2.5066876475216366, + "grad_norm": 0.22066196853846168, + "learning_rate": 3.55991107872585e-05, + "loss": 0.475, + "step": 795 + }, + { + "epoch": 2.5098347757671124, + "grad_norm": 0.17253963662301974, + "learning_rate": 3.558705183914066e-05, + "loss": 0.4734, + "step": 796 + }, + { + "epoch": 2.5129819040125883, + "grad_norm": 0.19881124746164847, + "learning_rate": 3.5574978699885134e-05, + "loss": 0.4832, + "step": 797 + }, + { + "epoch": 2.5161290322580645, + "grad_norm": 0.19723415337076033, + "learning_rate": 3.556289138210185e-05, + "loss": 0.4689, + "step": 798 + }, + { + "epoch": 2.5192761605035408, + "grad_norm": 0.18954283260561922, + "learning_rate": 3.555078989841551e-05, + "loss": 0.4757, + "step": 799 + }, + { + "epoch": 2.5224232887490166, + "grad_norm": 0.19352808470983424, + "learning_rate": 3.5538674261465655e-05, + "loss": 0.4713, + "step": 800 + }, + { + "epoch": 2.5255704169944924, + "grad_norm": 0.20041238629382177, + "learning_rate": 3.5526544483906575e-05, + "loss": 0.4845, + "step": 801 + }, + { + "epoch": 2.5287175452399686, + "grad_norm": 0.19181251462489346, + "learning_rate": 3.551440057840736e-05, + "loss": 0.4882, + "step": 802 + }, + { + "epoch": 2.5318646734854444, + "grad_norm": 0.18016461406070697, + "learning_rate": 3.5502242557651813e-05, + "loss": 0.4805, + "step": 803 + }, + { + "epoch": 2.5350118017309207, + "grad_norm": 0.20267911141580347, + "learning_rate": 3.5490070434338525e-05, + "loss": 0.4776, + "step": 804 + }, + { + "epoch": 2.5381589299763965, + "grad_norm": 0.19059012370650052, + "learning_rate": 3.5477884221180785e-05, + "loss": 0.4886, + "step": 805 + }, + { + "epoch": 2.5413060582218723, + "grad_norm": 0.21515842908040148, + "learning_rate": 3.546568393090662e-05, + "loss": 0.483, + "step": 806 + }, + { + "epoch": 2.5444531864673485, + "grad_norm": 0.19422697970653924, + "learning_rate": 3.5453469576258744e-05, + "loss": 0.4692, + "step": 807 + }, + { + "epoch": 2.5476003147128248, + "grad_norm": 0.209559004728807, + "learning_rate": 3.544124116999457e-05, + "loss": 0.4865, + "step": 808 + }, + { + "epoch": 2.5507474429583006, + "grad_norm": 0.1920441375389536, + "learning_rate": 3.542899872488618e-05, + "loss": 0.4793, + "step": 809 + }, + { + "epoch": 2.5538945712037764, + "grad_norm": 0.2319852120314673, + "learning_rate": 3.541674225372033e-05, + "loss": 0.4773, + "step": 810 + }, + { + "epoch": 2.5570416994492526, + "grad_norm": 0.19321156216806282, + "learning_rate": 3.540447176929841e-05, + "loss": 0.4757, + "step": 811 + }, + { + "epoch": 2.5601888276947284, + "grad_norm": 0.21514651398865584, + "learning_rate": 3.539218728443646e-05, + "loss": 0.4785, + "step": 812 + }, + { + "epoch": 2.5633359559402047, + "grad_norm": 0.21148869854656144, + "learning_rate": 3.537988881196514e-05, + "loss": 0.4746, + "step": 813 + }, + { + "epoch": 2.5664830841856805, + "grad_norm": 0.17595269634550698, + "learning_rate": 3.536757636472972e-05, + "loss": 0.4685, + "step": 814 + }, + { + "epoch": 2.5696302124311563, + "grad_norm": 0.20185229955902398, + "learning_rate": 3.5355249955590056e-05, + "loss": 0.4783, + "step": 815 + }, + { + "epoch": 2.5727773406766326, + "grad_norm": 0.17164696512537717, + "learning_rate": 3.53429095974206e-05, + "loss": 0.4775, + "step": 816 + }, + { + "epoch": 2.575924468922109, + "grad_norm": 0.23266847949769962, + "learning_rate": 3.533055530311036e-05, + "loss": 0.4692, + "step": 817 + }, + { + "epoch": 2.5790715971675846, + "grad_norm": 0.16306518710715495, + "learning_rate": 3.531818708556292e-05, + "loss": 0.4783, + "step": 818 + }, + { + "epoch": 2.5822187254130604, + "grad_norm": 0.20716704551095141, + "learning_rate": 3.530580495769638e-05, + "loss": 0.4785, + "step": 819 + }, + { + "epoch": 2.5853658536585367, + "grad_norm": 0.20742333714577207, + "learning_rate": 3.5293408932443384e-05, + "loss": 0.4795, + "step": 820 + }, + { + "epoch": 2.5885129819040125, + "grad_norm": 0.19428872955255258, + "learning_rate": 3.5280999022751095e-05, + "loss": 0.4853, + "step": 821 + }, + { + "epoch": 2.5916601101494887, + "grad_norm": 0.20147158146515537, + "learning_rate": 3.526857524158117e-05, + "loss": 0.468, + "step": 822 + }, + { + "epoch": 2.5948072383949645, + "grad_norm": 0.1893299986265306, + "learning_rate": 3.525613760190977e-05, + "loss": 0.4774, + "step": 823 + }, + { + "epoch": 2.5979543666404403, + "grad_norm": 0.17391513342893553, + "learning_rate": 3.524368611672749e-05, + "loss": 0.4698, + "step": 824 + }, + { + "epoch": 2.6011014948859166, + "grad_norm": 0.18505439269447876, + "learning_rate": 3.5231220799039434e-05, + "loss": 0.4759, + "step": 825 + }, + { + "epoch": 2.604248623131393, + "grad_norm": 0.18433327533924215, + "learning_rate": 3.521874166186512e-05, + "loss": 0.4745, + "step": 826 + }, + { + "epoch": 2.6073957513768686, + "grad_norm": 0.17820635418388936, + "learning_rate": 3.5206248718238525e-05, + "loss": 0.4862, + "step": 827 + }, + { + "epoch": 2.6105428796223444, + "grad_norm": 0.18696671678977475, + "learning_rate": 3.519374198120803e-05, + "loss": 0.4758, + "step": 828 + }, + { + "epoch": 2.6136900078678207, + "grad_norm": 0.18769968466549117, + "learning_rate": 3.5181221463836426e-05, + "loss": 0.4778, + "step": 829 + }, + { + "epoch": 2.6168371361132965, + "grad_norm": 0.191487891133253, + "learning_rate": 3.51686871792009e-05, + "loss": 0.4707, + "step": 830 + }, + { + "epoch": 2.6199842643587727, + "grad_norm": 0.1825542674839225, + "learning_rate": 3.5156139140393e-05, + "loss": 0.4706, + "step": 831 + }, + { + "epoch": 2.6231313926042485, + "grad_norm": 0.18855481959761283, + "learning_rate": 3.514357736051868e-05, + "loss": 0.4838, + "step": 832 + }, + { + "epoch": 2.6262785208497244, + "grad_norm": 0.18817841391423204, + "learning_rate": 3.513100185269821e-05, + "loss": 0.4685, + "step": 833 + }, + { + "epoch": 2.6294256490952006, + "grad_norm": 0.18928717937714, + "learning_rate": 3.51184126300662e-05, + "loss": 0.4781, + "step": 834 + }, + { + "epoch": 2.632572777340677, + "grad_norm": 0.2256933473661262, + "learning_rate": 3.510580970577161e-05, + "loss": 0.4739, + "step": 835 + }, + { + "epoch": 2.6357199055861527, + "grad_norm": 0.17564318088263306, + "learning_rate": 3.5093193092977694e-05, + "loss": 0.4718, + "step": 836 + }, + { + "epoch": 2.6388670338316285, + "grad_norm": 0.20104156818385482, + "learning_rate": 3.5080562804861996e-05, + "loss": 0.4802, + "step": 837 + }, + { + "epoch": 2.6420141620771047, + "grad_norm": 0.18928735051422238, + "learning_rate": 3.506791885461636e-05, + "loss": 0.4799, + "step": 838 + }, + { + "epoch": 2.6451612903225805, + "grad_norm": 0.17754299417673466, + "learning_rate": 3.505526125544688e-05, + "loss": 0.4739, + "step": 839 + }, + { + "epoch": 2.6483084185680568, + "grad_norm": 0.17924816010567865, + "learning_rate": 3.504259002057394e-05, + "loss": 0.4833, + "step": 840 + }, + { + "epoch": 2.6514555468135326, + "grad_norm": 0.18330420763324123, + "learning_rate": 3.5029905163232114e-05, + "loss": 0.4809, + "step": 841 + }, + { + "epoch": 2.654602675059009, + "grad_norm": 0.17763845173295095, + "learning_rate": 3.501720669667025e-05, + "loss": 0.478, + "step": 842 + }, + { + "epoch": 2.6577498033044846, + "grad_norm": 0.20606096391374554, + "learning_rate": 3.500449463415139e-05, + "loss": 0.4803, + "step": 843 + }, + { + "epoch": 2.660896931549961, + "grad_norm": 0.16091387572699684, + "learning_rate": 3.4991768988952794e-05, + "loss": 0.4777, + "step": 844 + }, + { + "epoch": 2.6640440597954367, + "grad_norm": 0.17951419173964245, + "learning_rate": 3.497902977436587e-05, + "loss": 0.4786, + "step": 845 + }, + { + "epoch": 2.6671911880409125, + "grad_norm": 0.17938289165750493, + "learning_rate": 3.4966277003696236e-05, + "loss": 0.4818, + "step": 846 + }, + { + "epoch": 2.6703383162863887, + "grad_norm": 0.18410261157957933, + "learning_rate": 3.495351069026365e-05, + "loss": 0.4738, + "step": 847 + }, + { + "epoch": 2.6734854445318645, + "grad_norm": 0.20590921125222097, + "learning_rate": 3.494073084740204e-05, + "loss": 0.486, + "step": 848 + }, + { + "epoch": 2.676632572777341, + "grad_norm": 0.20239376639918705, + "learning_rate": 3.492793748845942e-05, + "loss": 0.4782, + "step": 849 + }, + { + "epoch": 2.6797797010228166, + "grad_norm": 0.1916725798858272, + "learning_rate": 3.491513062679796e-05, + "loss": 0.47, + "step": 850 + }, + { + "epoch": 2.682926829268293, + "grad_norm": 0.2033931578311126, + "learning_rate": 3.490231027579393e-05, + "loss": 0.4791, + "step": 851 + }, + { + "epoch": 2.6860739575137687, + "grad_norm": 0.1941921749860495, + "learning_rate": 3.4889476448837656e-05, + "loss": 0.4882, + "step": 852 + }, + { + "epoch": 2.689221085759245, + "grad_norm": 0.19874836532319543, + "learning_rate": 3.4876629159333575e-05, + "loss": 0.4756, + "step": 853 + }, + { + "epoch": 2.6923682140047207, + "grad_norm": 0.20304633766767777, + "learning_rate": 3.486376842070017e-05, + "loss": 0.4793, + "step": 854 + }, + { + "epoch": 2.6955153422501965, + "grad_norm": 0.18295353981827106, + "learning_rate": 3.485089424636997e-05, + "loss": 0.4822, + "step": 855 + }, + { + "epoch": 2.6986624704956728, + "grad_norm": 0.2050344464266888, + "learning_rate": 3.4838006649789546e-05, + "loss": 0.4711, + "step": 856 + }, + { + "epoch": 2.7018095987411486, + "grad_norm": 0.17837345576674332, + "learning_rate": 3.482510564441949e-05, + "loss": 0.4835, + "step": 857 + }, + { + "epoch": 2.704956726986625, + "grad_norm": 0.22822296718181742, + "learning_rate": 3.4812191243734375e-05, + "loss": 0.4762, + "step": 858 + }, + { + "epoch": 2.7081038552321006, + "grad_norm": 0.1759543201588798, + "learning_rate": 3.479926346122279e-05, + "loss": 0.4738, + "step": 859 + }, + { + "epoch": 2.711250983477577, + "grad_norm": 0.2319777833974616, + "learning_rate": 3.478632231038729e-05, + "loss": 0.4794, + "step": 860 + }, + { + "epoch": 2.7143981117230527, + "grad_norm": 0.17035781259506136, + "learning_rate": 3.477336780474439e-05, + "loss": 0.4769, + "step": 861 + }, + { + "epoch": 2.717545239968529, + "grad_norm": 0.21472875867046778, + "learning_rate": 3.4760399957824576e-05, + "loss": 0.4818, + "step": 862 + }, + { + "epoch": 2.7206923682140047, + "grad_norm": 0.19547886779373266, + "learning_rate": 3.474741878317223e-05, + "loss": 0.4756, + "step": 863 + }, + { + "epoch": 2.7238394964594805, + "grad_norm": 0.1896076340759897, + "learning_rate": 3.4734424294345673e-05, + "loss": 0.4826, + "step": 864 + }, + { + "epoch": 2.726986624704957, + "grad_norm": 0.2005371691635383, + "learning_rate": 3.472141650491716e-05, + "loss": 0.4898, + "step": 865 + }, + { + "epoch": 2.7301337529504326, + "grad_norm": 0.2152647700389423, + "learning_rate": 3.470839542847279e-05, + "loss": 0.4816, + "step": 866 + }, + { + "epoch": 2.733280881195909, + "grad_norm": 0.2073082020273619, + "learning_rate": 3.4695361078612565e-05, + "loss": 0.4766, + "step": 867 + }, + { + "epoch": 2.7364280094413846, + "grad_norm": 0.22650494077281716, + "learning_rate": 3.468231346895035e-05, + "loss": 0.4773, + "step": 868 + }, + { + "epoch": 2.739575137686861, + "grad_norm": 0.18847447202117246, + "learning_rate": 3.466925261311386e-05, + "loss": 0.4757, + "step": 869 + }, + { + "epoch": 2.7427222659323367, + "grad_norm": 0.19717018165808387, + "learning_rate": 3.4656178524744644e-05, + "loss": 0.4723, + "step": 870 + }, + { + "epoch": 2.745869394177813, + "grad_norm": 0.20419628183572036, + "learning_rate": 3.464309121749805e-05, + "loss": 0.4685, + "step": 871 + }, + { + "epoch": 2.7490165224232888, + "grad_norm": 0.20942542946004844, + "learning_rate": 3.4629990705043274e-05, + "loss": 0.4807, + "step": 872 + }, + { + "epoch": 2.7521636506687646, + "grad_norm": 0.19751423169250182, + "learning_rate": 3.461687700106327e-05, + "loss": 0.478, + "step": 873 + }, + { + "epoch": 2.755310778914241, + "grad_norm": 0.20839382920660143, + "learning_rate": 3.46037501192548e-05, + "loss": 0.4796, + "step": 874 + }, + { + "epoch": 2.7584579071597166, + "grad_norm": 0.21264262241811938, + "learning_rate": 3.459061007332835e-05, + "loss": 0.483, + "step": 875 + }, + { + "epoch": 2.761605035405193, + "grad_norm": 0.21206865795633334, + "learning_rate": 3.457745687700818e-05, + "loss": 0.482, + "step": 876 + }, + { + "epoch": 2.7647521636506687, + "grad_norm": 0.2267998478365036, + "learning_rate": 3.4564290544032304e-05, + "loss": 0.4852, + "step": 877 + }, + { + "epoch": 2.767899291896145, + "grad_norm": 0.19962244872664645, + "learning_rate": 3.455111108815242e-05, + "loss": 0.4781, + "step": 878 + }, + { + "epoch": 2.7710464201416207, + "grad_norm": 0.20223940173326052, + "learning_rate": 3.453791852313395e-05, + "loss": 0.4815, + "step": 879 + }, + { + "epoch": 2.774193548387097, + "grad_norm": 0.2497760291214373, + "learning_rate": 3.4524712862756004e-05, + "loss": 0.4737, + "step": 880 + }, + { + "epoch": 2.777340676632573, + "grad_norm": 0.21392418161927715, + "learning_rate": 3.451149412081137e-05, + "loss": 0.4849, + "step": 881 + }, + { + "epoch": 2.7804878048780486, + "grad_norm": 0.20756513406871988, + "learning_rate": 3.4498262311106505e-05, + "loss": 0.4794, + "step": 882 + }, + { + "epoch": 2.783634933123525, + "grad_norm": 0.19434508478386714, + "learning_rate": 3.448501744746151e-05, + "loss": 0.4717, + "step": 883 + }, + { + "epoch": 2.7867820613690006, + "grad_norm": 0.17815978736962265, + "learning_rate": 3.4471759543710115e-05, + "loss": 0.479, + "step": 884 + }, + { + "epoch": 2.789929189614477, + "grad_norm": 0.21692701459569838, + "learning_rate": 3.4458488613699686e-05, + "loss": 0.4711, + "step": 885 + }, + { + "epoch": 2.7930763178599527, + "grad_norm": 0.1873399635626309, + "learning_rate": 3.444520467129118e-05, + "loss": 0.484, + "step": 886 + }, + { + "epoch": 2.796223446105429, + "grad_norm": 0.1976319148693629, + "learning_rate": 3.4431907730359137e-05, + "loss": 0.4777, + "step": 887 + }, + { + "epoch": 2.7993705743509048, + "grad_norm": 0.19261763199922333, + "learning_rate": 3.44185978047917e-05, + "loss": 0.4658, + "step": 888 + }, + { + "epoch": 2.802517702596381, + "grad_norm": 0.20902878866014565, + "learning_rate": 3.440527490849055e-05, + "loss": 0.4751, + "step": 889 + }, + { + "epoch": 2.805664830841857, + "grad_norm": 0.196677203826585, + "learning_rate": 3.439193905537094e-05, + "loss": 0.4739, + "step": 890 + }, + { + "epoch": 2.8088119590873326, + "grad_norm": 0.18777554851103495, + "learning_rate": 3.4378590259361626e-05, + "loss": 0.471, + "step": 891 + }, + { + "epoch": 2.811959087332809, + "grad_norm": 0.2206571953754081, + "learning_rate": 3.4365228534404895e-05, + "loss": 0.479, + "step": 892 + }, + { + "epoch": 2.8151062155782847, + "grad_norm": 0.16662192562786868, + "learning_rate": 3.435185389445655e-05, + "loss": 0.4745, + "step": 893 + }, + { + "epoch": 2.818253343823761, + "grad_norm": 0.19851273382341908, + "learning_rate": 3.433846635348587e-05, + "loss": 0.4773, + "step": 894 + }, + { + "epoch": 2.8214004720692367, + "grad_norm": 0.19549393874279214, + "learning_rate": 3.43250659254756e-05, + "loss": 0.4683, + "step": 895 + }, + { + "epoch": 2.824547600314713, + "grad_norm": 0.16826088149239804, + "learning_rate": 3.4311652624421976e-05, + "loss": 0.48, + "step": 896 + }, + { + "epoch": 2.8276947285601888, + "grad_norm": 0.1762752510049042, + "learning_rate": 3.429822646433464e-05, + "loss": 0.479, + "step": 897 + }, + { + "epoch": 2.830841856805665, + "grad_norm": 0.18678077326219097, + "learning_rate": 3.4284787459236705e-05, + "loss": 0.4723, + "step": 898 + }, + { + "epoch": 2.833988985051141, + "grad_norm": 0.1630882015779384, + "learning_rate": 3.427133562316466e-05, + "loss": 0.4782, + "step": 899 + }, + { + "epoch": 2.8371361132966166, + "grad_norm": 0.17882705777771293, + "learning_rate": 3.425787097016843e-05, + "loss": 0.4714, + "step": 900 + }, + { + "epoch": 2.840283241542093, + "grad_norm": 0.17187562653472938, + "learning_rate": 3.424439351431131e-05, + "loss": 0.4742, + "step": 901 + }, + { + "epoch": 2.8434303697875687, + "grad_norm": 0.174319821594161, + "learning_rate": 3.423090326966996e-05, + "loss": 0.4823, + "step": 902 + }, + { + "epoch": 2.846577498033045, + "grad_norm": 0.1690838895842069, + "learning_rate": 3.4217400250334416e-05, + "loss": 0.4773, + "step": 903 + }, + { + "epoch": 2.8497246262785207, + "grad_norm": 0.1731044457799335, + "learning_rate": 3.420388447040804e-05, + "loss": 0.4684, + "step": 904 + }, + { + "epoch": 2.852871754523997, + "grad_norm": 0.177357699597428, + "learning_rate": 3.419035594400753e-05, + "loss": 0.477, + "step": 905 + }, + { + "epoch": 2.856018882769473, + "grad_norm": 0.1827484820724141, + "learning_rate": 3.41768146852629e-05, + "loss": 0.4791, + "step": 906 + }, + { + "epoch": 2.859166011014949, + "grad_norm": 0.20812185117779652, + "learning_rate": 3.416326070831746e-05, + "loss": 0.4818, + "step": 907 + }, + { + "epoch": 2.862313139260425, + "grad_norm": 0.19582434700048504, + "learning_rate": 3.414969402732779e-05, + "loss": 0.4736, + "step": 908 + }, + { + "epoch": 2.8654602675059007, + "grad_norm": 0.17405726221555853, + "learning_rate": 3.4136114656463766e-05, + "loss": 0.4822, + "step": 909 + }, + { + "epoch": 2.868607395751377, + "grad_norm": 0.1886351350028885, + "learning_rate": 3.4122522609908504e-05, + "loss": 0.4799, + "step": 910 + }, + { + "epoch": 2.8717545239968527, + "grad_norm": 0.17300845271944784, + "learning_rate": 3.410891790185834e-05, + "loss": 0.4737, + "step": 911 + }, + { + "epoch": 2.874901652242329, + "grad_norm": 0.17440320919074534, + "learning_rate": 3.409530054652287e-05, + "loss": 0.4731, + "step": 912 + }, + { + "epoch": 2.8780487804878048, + "grad_norm": 0.1803068800569239, + "learning_rate": 3.408167055812488e-05, + "loss": 0.4769, + "step": 913 + }, + { + "epoch": 2.881195908733281, + "grad_norm": 0.181432403586137, + "learning_rate": 3.406802795090034e-05, + "loss": 0.4915, + "step": 914 + }, + { + "epoch": 2.884343036978757, + "grad_norm": 0.16179024096943073, + "learning_rate": 3.405437273909843e-05, + "loss": 0.4795, + "step": 915 + }, + { + "epoch": 2.887490165224233, + "grad_norm": 0.17972825825989644, + "learning_rate": 3.4040704936981475e-05, + "loss": 0.4761, + "step": 916 + }, + { + "epoch": 2.890637293469709, + "grad_norm": 0.17303120146453108, + "learning_rate": 3.4027024558824956e-05, + "loss": 0.4737, + "step": 917 + }, + { + "epoch": 2.8937844217151847, + "grad_norm": 0.19260008079093655, + "learning_rate": 3.401333161891747e-05, + "loss": 0.4827, + "step": 918 + }, + { + "epoch": 2.896931549960661, + "grad_norm": 0.19408527896636127, + "learning_rate": 3.3999626131560754e-05, + "loss": 0.4791, + "step": 919 + }, + { + "epoch": 2.9000786782061367, + "grad_norm": 0.17709492241260272, + "learning_rate": 3.398590811106966e-05, + "loss": 0.4758, + "step": 920 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 0.24719908496023188, + "learning_rate": 3.397217757177211e-05, + "loss": 0.478, + "step": 921 + }, + { + "epoch": 2.906372934697089, + "grad_norm": 0.21088224277568318, + "learning_rate": 3.395843452800912e-05, + "loss": 0.4677, + "step": 922 + }, + { + "epoch": 2.909520062942565, + "grad_norm": 0.18357158178528057, + "learning_rate": 3.394467899413473e-05, + "loss": 0.4822, + "step": 923 + }, + { + "epoch": 2.912667191188041, + "grad_norm": 0.20061925910804337, + "learning_rate": 3.393091098451607e-05, + "loss": 0.4796, + "step": 924 + }, + { + "epoch": 2.915814319433517, + "grad_norm": 0.257783364662756, + "learning_rate": 3.391713051353328e-05, + "loss": 0.4823, + "step": 925 + }, + { + "epoch": 2.918961447678993, + "grad_norm": 0.20566149454081753, + "learning_rate": 3.39033375955795e-05, + "loss": 0.4806, + "step": 926 + }, + { + "epoch": 2.9221085759244687, + "grad_norm": 0.21739345983865224, + "learning_rate": 3.388953224506091e-05, + "loss": 0.479, + "step": 927 + }, + { + "epoch": 2.925255704169945, + "grad_norm": 0.2019066969178866, + "learning_rate": 3.3875714476396635e-05, + "loss": 0.4791, + "step": 928 + }, + { + "epoch": 2.9284028324154208, + "grad_norm": 0.18964152674565116, + "learning_rate": 3.38618843040188e-05, + "loss": 0.4872, + "step": 929 + }, + { + "epoch": 2.931549960660897, + "grad_norm": 0.21572408114004044, + "learning_rate": 3.384804174237246e-05, + "loss": 0.4856, + "step": 930 + }, + { + "epoch": 2.934697088906373, + "grad_norm": 0.20435168904218748, + "learning_rate": 3.3834186805915634e-05, + "loss": 0.4823, + "step": 931 + }, + { + "epoch": 2.937844217151849, + "grad_norm": 0.23036698086458443, + "learning_rate": 3.382031950911925e-05, + "loss": 0.4842, + "step": 932 + }, + { + "epoch": 2.940991345397325, + "grad_norm": 0.2559922205766231, + "learning_rate": 3.380643986646714e-05, + "loss": 0.4683, + "step": 933 + }, + { + "epoch": 2.944138473642801, + "grad_norm": 0.20816226293770818, + "learning_rate": 3.3792547892456045e-05, + "loss": 0.478, + "step": 934 + }, + { + "epoch": 2.947285601888277, + "grad_norm": 0.27147963925238433, + "learning_rate": 3.37786436015956e-05, + "loss": 0.4716, + "step": 935 + }, + { + "epoch": 2.9504327301337527, + "grad_norm": 0.1917348894021088, + "learning_rate": 3.376472700840827e-05, + "loss": 0.4855, + "step": 936 + }, + { + "epoch": 2.953579858379229, + "grad_norm": 0.26586313382080357, + "learning_rate": 3.375079812742939e-05, + "loss": 0.4751, + "step": 937 + }, + { + "epoch": 2.9567269866247052, + "grad_norm": 0.22006766096753588, + "learning_rate": 3.373685697320713e-05, + "loss": 0.4777, + "step": 938 + }, + { + "epoch": 2.959874114870181, + "grad_norm": 0.22686294528313417, + "learning_rate": 3.372290356030246e-05, + "loss": 0.4788, + "step": 939 + }, + { + "epoch": 2.963021243115657, + "grad_norm": 0.19665794947101586, + "learning_rate": 3.370893790328917e-05, + "loss": 0.4904, + "step": 940 + }, + { + "epoch": 2.966168371361133, + "grad_norm": 0.21016583933505117, + "learning_rate": 3.369496001675385e-05, + "loss": 0.4846, + "step": 941 + }, + { + "epoch": 2.969315499606609, + "grad_norm": 0.1823767869897883, + "learning_rate": 3.368096991529583e-05, + "loss": 0.474, + "step": 942 + }, + { + "epoch": 2.972462627852085, + "grad_norm": 0.19889515201753574, + "learning_rate": 3.366696761352723e-05, + "loss": 0.4744, + "step": 943 + }, + { + "epoch": 2.975609756097561, + "grad_norm": 0.19883183201167187, + "learning_rate": 3.36529531260729e-05, + "loss": 0.4872, + "step": 944 + }, + { + "epoch": 2.9787568843430368, + "grad_norm": 0.17416758317261993, + "learning_rate": 3.363892646757041e-05, + "loss": 0.4791, + "step": 945 + }, + { + "epoch": 2.981904012588513, + "grad_norm": 0.1922592641536021, + "learning_rate": 3.362488765267006e-05, + "loss": 0.4815, + "step": 946 + }, + { + "epoch": 2.9850511408339893, + "grad_norm": 0.17209027351228018, + "learning_rate": 3.361083669603482e-05, + "loss": 0.4796, + "step": 947 + }, + { + "epoch": 2.988198269079465, + "grad_norm": 0.17084312866058127, + "learning_rate": 3.3596773612340375e-05, + "loss": 0.4805, + "step": 948 + }, + { + "epoch": 2.991345397324941, + "grad_norm": 0.16455840604865776, + "learning_rate": 3.358269841627504e-05, + "loss": 0.4734, + "step": 949 + }, + { + "epoch": 2.994492525570417, + "grad_norm": 0.17609503259030376, + "learning_rate": 3.356861112253982e-05, + "loss": 0.4813, + "step": 950 + }, + { + "epoch": 2.997639653815893, + "grad_norm": 0.16815201729549342, + "learning_rate": 3.355451174584834e-05, + "loss": 0.477, + "step": 951 + }, + { + "epoch": 3.003147128245476, + "grad_norm": 0.4088775165911742, + "learning_rate": 3.35404003009268e-05, + "loss": 0.9, + "step": 952 + }, + { + "epoch": 3.006294256490952, + "grad_norm": 0.32652509762470977, + "learning_rate": 3.352627680251409e-05, + "loss": 0.4413, + "step": 953 + }, + { + "epoch": 3.009441384736428, + "grad_norm": 0.31181526619361466, + "learning_rate": 3.3512141265361625e-05, + "loss": 0.4442, + "step": 954 + }, + { + "epoch": 3.012588512981904, + "grad_norm": 0.28249452749334586, + "learning_rate": 3.3497993704233415e-05, + "loss": 0.4341, + "step": 955 + }, + { + "epoch": 3.01573564122738, + "grad_norm": 0.2469014492840724, + "learning_rate": 3.348383413390603e-05, + "loss": 0.4357, + "step": 956 + }, + { + "epoch": 3.018882769472856, + "grad_norm": 0.29518171306893987, + "learning_rate": 3.346966256916858e-05, + "loss": 0.4331, + "step": 957 + }, + { + "epoch": 3.022029897718332, + "grad_norm": 0.24550117952865097, + "learning_rate": 3.345547902482271e-05, + "loss": 0.4328, + "step": 958 + }, + { + "epoch": 3.025177025963808, + "grad_norm": 0.26322265876569234, + "learning_rate": 3.344128351568255e-05, + "loss": 0.4296, + "step": 959 + }, + { + "epoch": 3.028324154209284, + "grad_norm": 0.3355141854171973, + "learning_rate": 3.3427076056574765e-05, + "loss": 0.4399, + "step": 960 + }, + { + "epoch": 3.03147128245476, + "grad_norm": 0.24848144520611637, + "learning_rate": 3.341285666233849e-05, + "loss": 0.4379, + "step": 961 + }, + { + "epoch": 3.034618410700236, + "grad_norm": 0.3051743839260837, + "learning_rate": 3.3398625347825295e-05, + "loss": 0.4321, + "step": 962 + }, + { + "epoch": 3.037765538945712, + "grad_norm": 0.2520446263992142, + "learning_rate": 3.3384382127899254e-05, + "loss": 0.4326, + "step": 963 + }, + { + "epoch": 3.040912667191188, + "grad_norm": 0.23491369763876913, + "learning_rate": 3.337012701743682e-05, + "loss": 0.4304, + "step": 964 + }, + { + "epoch": 3.044059795436664, + "grad_norm": 0.22739503178054998, + "learning_rate": 3.33558600313269e-05, + "loss": 0.4316, + "step": 965 + }, + { + "epoch": 3.04720692368214, + "grad_norm": 0.23290557210702711, + "learning_rate": 3.334158118447081e-05, + "loss": 0.4205, + "step": 966 + }, + { + "epoch": 3.050354051927616, + "grad_norm": 0.23718213519938236, + "learning_rate": 3.3327290491782214e-05, + "loss": 0.4276, + "step": 967 + }, + { + "epoch": 3.0535011801730922, + "grad_norm": 0.22184226524551506, + "learning_rate": 3.331298796818719e-05, + "loss": 0.4336, + "step": 968 + }, + { + "epoch": 3.056648308418568, + "grad_norm": 0.24883677057218126, + "learning_rate": 3.329867362862416e-05, + "loss": 0.4202, + "step": 969 + }, + { + "epoch": 3.059795436664044, + "grad_norm": 0.20245231032426314, + "learning_rate": 3.328434748804389e-05, + "loss": 0.4283, + "step": 970 + }, + { + "epoch": 3.06294256490952, + "grad_norm": 0.22154485320162687, + "learning_rate": 3.327000956140944e-05, + "loss": 0.4276, + "step": 971 + }, + { + "epoch": 3.066089693154996, + "grad_norm": 0.23045678598682195, + "learning_rate": 3.325565986369624e-05, + "loss": 0.438, + "step": 972 + }, + { + "epoch": 3.069236821400472, + "grad_norm": 0.21978195581485033, + "learning_rate": 3.3241298409891967e-05, + "loss": 0.4347, + "step": 973 + }, + { + "epoch": 3.072383949645948, + "grad_norm": 0.2459169417158989, + "learning_rate": 3.3226925214996586e-05, + "loss": 0.426, + "step": 974 + }, + { + "epoch": 3.075531077891424, + "grad_norm": 0.2087661039669764, + "learning_rate": 3.3212540294022324e-05, + "loss": 0.424, + "step": 975 + }, + { + "epoch": 3.0786782061369, + "grad_norm": 0.22053026125524922, + "learning_rate": 3.319814366199368e-05, + "loss": 0.4358, + "step": 976 + }, + { + "epoch": 3.0818253343823763, + "grad_norm": 0.22791109755583255, + "learning_rate": 3.318373533394735e-05, + "loss": 0.4339, + "step": 977 + }, + { + "epoch": 3.084972462627852, + "grad_norm": 0.2097976976701724, + "learning_rate": 3.3169315324932276e-05, + "loss": 0.4315, + "step": 978 + }, + { + "epoch": 3.088119590873328, + "grad_norm": 0.18718087218415436, + "learning_rate": 3.3154883650009584e-05, + "loss": 0.4311, + "step": 979 + }, + { + "epoch": 3.091266719118804, + "grad_norm": 0.20933947694391003, + "learning_rate": 3.314044032425258e-05, + "loss": 0.4391, + "step": 980 + }, + { + "epoch": 3.09441384736428, + "grad_norm": 0.22529123099122955, + "learning_rate": 3.3125985362746745e-05, + "loss": 0.4262, + "step": 981 + }, + { + "epoch": 3.097560975609756, + "grad_norm": 0.1786820765800306, + "learning_rate": 3.3111518780589723e-05, + "loss": 0.4397, + "step": 982 + }, + { + "epoch": 3.100708103855232, + "grad_norm": 0.22962197514376026, + "learning_rate": 3.3097040592891284e-05, + "loss": 0.4308, + "step": 983 + }, + { + "epoch": 3.1038552321007082, + "grad_norm": 0.24051879415776045, + "learning_rate": 3.30825508147733e-05, + "loss": 0.4318, + "step": 984 + }, + { + "epoch": 3.107002360346184, + "grad_norm": 0.20025474171903923, + "learning_rate": 3.30680494613698e-05, + "loss": 0.4384, + "step": 985 + }, + { + "epoch": 3.1101494885916603, + "grad_norm": 0.19206558390260614, + "learning_rate": 3.305353654782687e-05, + "loss": 0.4297, + "step": 986 + }, + { + "epoch": 3.113296616837136, + "grad_norm": 0.20782219172064703, + "learning_rate": 3.303901208930266e-05, + "loss": 0.4231, + "step": 987 + }, + { + "epoch": 3.116443745082612, + "grad_norm": 0.20010226717176327, + "learning_rate": 3.30244761009674e-05, + "loss": 0.4254, + "step": 988 + }, + { + "epoch": 3.119590873328088, + "grad_norm": 0.18984132712344276, + "learning_rate": 3.300992859800336e-05, + "loss": 0.4244, + "step": 989 + }, + { + "epoch": 3.122738001573564, + "grad_norm": 0.23519299020760812, + "learning_rate": 3.299536959560481e-05, + "loss": 0.4365, + "step": 990 + }, + { + "epoch": 3.12588512981904, + "grad_norm": 0.1950168518008789, + "learning_rate": 3.2980799108978065e-05, + "loss": 0.434, + "step": 991 + }, + { + "epoch": 3.129032258064516, + "grad_norm": 0.17931137612435868, + "learning_rate": 3.296621715334143e-05, + "loss": 0.4321, + "step": 992 + }, + { + "epoch": 3.1321793863099923, + "grad_norm": 0.20090892857586998, + "learning_rate": 3.295162374392518e-05, + "loss": 0.4205, + "step": 993 + }, + { + "epoch": 3.135326514555468, + "grad_norm": 0.18545062281039967, + "learning_rate": 3.293701889597153e-05, + "loss": 0.4289, + "step": 994 + }, + { + "epoch": 3.1384736428009443, + "grad_norm": 0.23236278068083843, + "learning_rate": 3.292240262473469e-05, + "loss": 0.4268, + "step": 995 + }, + { + "epoch": 3.14162077104642, + "grad_norm": 0.2119237430411335, + "learning_rate": 3.290777494548075e-05, + "loss": 0.4361, + "step": 996 + }, + { + "epoch": 3.144767899291896, + "grad_norm": 0.18687999372723774, + "learning_rate": 3.289313587348778e-05, + "loss": 0.4285, + "step": 997 + }, + { + "epoch": 3.147915027537372, + "grad_norm": 0.242430728271403, + "learning_rate": 3.287848542404568e-05, + "loss": 0.4322, + "step": 998 + }, + { + "epoch": 3.151062155782848, + "grad_norm": 0.20753524478639734, + "learning_rate": 3.2863823612456264e-05, + "loss": 0.4286, + "step": 999 + }, + { + "epoch": 3.1542092840283242, + "grad_norm": 0.2204970218249786, + "learning_rate": 3.284915045403325e-05, + "loss": 0.4213, + "step": 1000 + }, + { + "epoch": 3.1573564122738, + "grad_norm": 0.2077798788084683, + "learning_rate": 3.283446596410212e-05, + "loss": 0.4243, + "step": 1001 + }, + { + "epoch": 3.1605035405192763, + "grad_norm": 0.20913787484273705, + "learning_rate": 3.281977015800028e-05, + "loss": 0.4349, + "step": 1002 + }, + { + "epoch": 3.163650668764752, + "grad_norm": 0.1845966280590288, + "learning_rate": 3.28050630510769e-05, + "loss": 0.4367, + "step": 1003 + }, + { + "epoch": 3.1667977970102283, + "grad_norm": 0.19783164507955084, + "learning_rate": 3.279034465869298e-05, + "loss": 0.4256, + "step": 1004 + }, + { + "epoch": 3.169944925255704, + "grad_norm": 0.2645215067092485, + "learning_rate": 3.277561499622129e-05, + "loss": 0.4358, + "step": 1005 + }, + { + "epoch": 3.17309205350118, + "grad_norm": 0.17753403080442126, + "learning_rate": 3.276087407904639e-05, + "loss": 0.4298, + "step": 1006 + }, + { + "epoch": 3.176239181746656, + "grad_norm": 0.23434775817714046, + "learning_rate": 3.274612192256457e-05, + "loss": 0.4328, + "step": 1007 + }, + { + "epoch": 3.179386309992132, + "grad_norm": 0.18670646442316036, + "learning_rate": 3.273135854218389e-05, + "loss": 0.4289, + "step": 1008 + }, + { + "epoch": 3.1825334382376083, + "grad_norm": 0.18890780180865613, + "learning_rate": 3.2716583953324094e-05, + "loss": 0.4377, + "step": 1009 + }, + { + "epoch": 3.185680566483084, + "grad_norm": 0.21371681553609642, + "learning_rate": 3.2701798171416674e-05, + "loss": 0.4315, + "step": 1010 + }, + { + "epoch": 3.1888276947285603, + "grad_norm": 0.1965403832948832, + "learning_rate": 3.268700121190479e-05, + "loss": 0.4349, + "step": 1011 + }, + { + "epoch": 3.191974822974036, + "grad_norm": 0.19377735620113185, + "learning_rate": 3.267219309024328e-05, + "loss": 0.427, + "step": 1012 + }, + { + "epoch": 3.1951219512195124, + "grad_norm": 0.18770678346838374, + "learning_rate": 3.265737382189863e-05, + "loss": 0.4267, + "step": 1013 + }, + { + "epoch": 3.198269079464988, + "grad_norm": 0.2042394668976984, + "learning_rate": 3.2642543422349e-05, + "loss": 0.4385, + "step": 1014 + }, + { + "epoch": 3.201416207710464, + "grad_norm": 0.19338325740387202, + "learning_rate": 3.2627701907084136e-05, + "loss": 0.4232, + "step": 1015 + }, + { + "epoch": 3.2045633359559402, + "grad_norm": 0.19512605168524141, + "learning_rate": 3.2612849291605425e-05, + "loss": 0.4291, + "step": 1016 + }, + { + "epoch": 3.207710464201416, + "grad_norm": 0.18182108276136827, + "learning_rate": 3.259798559142583e-05, + "loss": 0.4339, + "step": 1017 + }, + { + "epoch": 3.2108575924468923, + "grad_norm": 0.17815282317494613, + "learning_rate": 3.2583110822069894e-05, + "loss": 0.4348, + "step": 1018 + }, + { + "epoch": 3.214004720692368, + "grad_norm": 0.19146604497628397, + "learning_rate": 3.2568224999073725e-05, + "loss": 0.4253, + "step": 1019 + }, + { + "epoch": 3.2171518489378443, + "grad_norm": 0.17868445991882773, + "learning_rate": 3.255332813798499e-05, + "loss": 0.4366, + "step": 1020 + }, + { + "epoch": 3.22029897718332, + "grad_norm": 0.22039699239981317, + "learning_rate": 3.253842025436286e-05, + "loss": 0.4288, + "step": 1021 + }, + { + "epoch": 3.2234461054287964, + "grad_norm": 0.1801947638201736, + "learning_rate": 3.252350136377802e-05, + "loss": 0.4271, + "step": 1022 + }, + { + "epoch": 3.226593233674272, + "grad_norm": 0.1849719695926772, + "learning_rate": 3.2508571481812686e-05, + "loss": 0.4305, + "step": 1023 + }, + { + "epoch": 3.229740361919748, + "grad_norm": 0.17788240132478986, + "learning_rate": 3.2493630624060494e-05, + "loss": 0.4402, + "step": 1024 + }, + { + "epoch": 3.2328874901652243, + "grad_norm": 0.1845071818835128, + "learning_rate": 3.2478678806126614e-05, + "loss": 0.4389, + "step": 1025 + }, + { + "epoch": 3.2360346184107, + "grad_norm": 0.1858623426241377, + "learning_rate": 3.24637160436276e-05, + "loss": 0.4339, + "step": 1026 + }, + { + "epoch": 3.2391817466561763, + "grad_norm": 0.17220083563874175, + "learning_rate": 3.2448742352191476e-05, + "loss": 0.4331, + "step": 1027 + }, + { + "epoch": 3.242328874901652, + "grad_norm": 0.2247895258402866, + "learning_rate": 3.243375774745768e-05, + "loss": 0.4289, + "step": 1028 + }, + { + "epoch": 3.2454760031471284, + "grad_norm": 0.16684480973484458, + "learning_rate": 3.241876224507702e-05, + "loss": 0.4334, + "step": 1029 + }, + { + "epoch": 3.248623131392604, + "grad_norm": 0.1779584572036571, + "learning_rate": 3.240375586071171e-05, + "loss": 0.4399, + "step": 1030 + }, + { + "epoch": 3.2517702596380804, + "grad_norm": 0.191113784730864, + "learning_rate": 3.238873861003533e-05, + "loss": 0.4367, + "step": 1031 + }, + { + "epoch": 3.2549173878835562, + "grad_norm": 0.18550711603052794, + "learning_rate": 3.237371050873281e-05, + "loss": 0.4406, + "step": 1032 + }, + { + "epoch": 3.258064516129032, + "grad_norm": 0.1783887143847433, + "learning_rate": 3.235867157250039e-05, + "loss": 0.4418, + "step": 1033 + }, + { + "epoch": 3.2612116443745083, + "grad_norm": 0.17835625882815775, + "learning_rate": 3.234362181704565e-05, + "loss": 0.4295, + "step": 1034 + }, + { + "epoch": 3.264358772619984, + "grad_norm": 0.18249704241735015, + "learning_rate": 3.232856125808746e-05, + "loss": 0.4294, + "step": 1035 + }, + { + "epoch": 3.2675059008654603, + "grad_norm": 0.17475471863362183, + "learning_rate": 3.231348991135599e-05, + "loss": 0.4364, + "step": 1036 + }, + { + "epoch": 3.270653029110936, + "grad_norm": 0.19935489428291137, + "learning_rate": 3.229840779259266e-05, + "loss": 0.4255, + "step": 1037 + }, + { + "epoch": 3.2738001573564124, + "grad_norm": 0.1693521531229919, + "learning_rate": 3.2283314917550136e-05, + "loss": 0.4359, + "step": 1038 + }, + { + "epoch": 3.276947285601888, + "grad_norm": 0.19130787480398617, + "learning_rate": 3.226821130199233e-05, + "loss": 0.431, + "step": 1039 + }, + { + "epoch": 3.2800944138473644, + "grad_norm": 0.16737976497369075, + "learning_rate": 3.225309696169438e-05, + "loss": 0.4311, + "step": 1040 + }, + { + "epoch": 3.2832415420928402, + "grad_norm": 0.18392433219239301, + "learning_rate": 3.223797191244261e-05, + "loss": 0.4356, + "step": 1041 + }, + { + "epoch": 3.286388670338316, + "grad_norm": 0.19864523441807563, + "learning_rate": 3.2222836170034543e-05, + "loss": 0.4247, + "step": 1042 + }, + { + "epoch": 3.2895357985837923, + "grad_norm": 0.1984089785922852, + "learning_rate": 3.220768975027886e-05, + "loss": 0.4374, + "step": 1043 + }, + { + "epoch": 3.292682926829268, + "grad_norm": 0.19402343905084715, + "learning_rate": 3.2192532668995385e-05, + "loss": 0.4254, + "step": 1044 + }, + { + "epoch": 3.2958300550747444, + "grad_norm": 0.2041163961571167, + "learning_rate": 3.21773649420151e-05, + "loss": 0.4358, + "step": 1045 + }, + { + "epoch": 3.29897718332022, + "grad_norm": 0.1787832950909068, + "learning_rate": 3.2162186585180095e-05, + "loss": 0.4231, + "step": 1046 + }, + { + "epoch": 3.3021243115656964, + "grad_norm": 0.19740027959624745, + "learning_rate": 3.214699761434355e-05, + "loss": 0.4302, + "step": 1047 + }, + { + "epoch": 3.305271439811172, + "grad_norm": 0.18856045951279926, + "learning_rate": 3.2131798045369765e-05, + "loss": 0.4308, + "step": 1048 + }, + { + "epoch": 3.3084185680566485, + "grad_norm": 0.20807270153545412, + "learning_rate": 3.211658789413408e-05, + "loss": 0.4351, + "step": 1049 + }, + { + "epoch": 3.3115656963021243, + "grad_norm": 0.19162342835863225, + "learning_rate": 3.2101367176522886e-05, + "loss": 0.4354, + "step": 1050 + }, + { + "epoch": 3.3147128245476, + "grad_norm": 0.19387129843147574, + "learning_rate": 3.2086135908433634e-05, + "loss": 0.43, + "step": 1051 + }, + { + "epoch": 3.3178599527930763, + "grad_norm": 0.18150899186866062, + "learning_rate": 3.2070894105774766e-05, + "loss": 0.4344, + "step": 1052 + }, + { + "epoch": 3.321007081038552, + "grad_norm": 0.21537331116177905, + "learning_rate": 3.2055641784465745e-05, + "loss": 0.4415, + "step": 1053 + }, + { + "epoch": 3.3241542092840284, + "grad_norm": 0.2400691954661002, + "learning_rate": 3.2040378960437024e-05, + "loss": 0.4406, + "step": 1054 + }, + { + "epoch": 3.327301337529504, + "grad_norm": 0.19817600331351765, + "learning_rate": 3.2025105649630014e-05, + "loss": 0.4315, + "step": 1055 + }, + { + "epoch": 3.3304484657749804, + "grad_norm": 0.2084827766948241, + "learning_rate": 3.200982186799709e-05, + "loss": 0.4187, + "step": 1056 + }, + { + "epoch": 3.3335955940204562, + "grad_norm": 0.22112170756734634, + "learning_rate": 3.199452763150155e-05, + "loss": 0.4315, + "step": 1057 + }, + { + "epoch": 3.3367427222659325, + "grad_norm": 0.19512676684498764, + "learning_rate": 3.197922295611762e-05, + "loss": 0.4345, + "step": 1058 + }, + { + "epoch": 3.3398898505114083, + "grad_norm": 0.2663842173534082, + "learning_rate": 3.196390785783043e-05, + "loss": 0.4346, + "step": 1059 + }, + { + "epoch": 3.343036978756884, + "grad_norm": 0.21826809183094342, + "learning_rate": 3.194858235263598e-05, + "loss": 0.4355, + "step": 1060 + }, + { + "epoch": 3.3461841070023604, + "grad_norm": 0.20953163723288945, + "learning_rate": 3.193324645654118e-05, + "loss": 0.4301, + "step": 1061 + }, + { + "epoch": 3.349331235247836, + "grad_norm": 0.24320015638205855, + "learning_rate": 3.191790018556373e-05, + "loss": 0.4425, + "step": 1062 + }, + { + "epoch": 3.3524783634933124, + "grad_norm": 0.1904143985171064, + "learning_rate": 3.190254355573223e-05, + "loss": 0.4378, + "step": 1063 + }, + { + "epoch": 3.355625491738788, + "grad_norm": 0.22691576279072934, + "learning_rate": 3.1887176583086066e-05, + "loss": 0.4263, + "step": 1064 + }, + { + "epoch": 3.3587726199842645, + "grad_norm": 0.21161650205231633, + "learning_rate": 3.187179928367544e-05, + "loss": 0.4251, + "step": 1065 + }, + { + "epoch": 3.3619197482297403, + "grad_norm": 0.20609048888260278, + "learning_rate": 3.185641167356131e-05, + "loss": 0.4283, + "step": 1066 + }, + { + "epoch": 3.3650668764752165, + "grad_norm": 0.2065246689397034, + "learning_rate": 3.184101376881545e-05, + "loss": 0.4292, + "step": 1067 + }, + { + "epoch": 3.3682140047206923, + "grad_norm": 0.21953821452618635, + "learning_rate": 3.1825605585520343e-05, + "loss": 0.4334, + "step": 1068 + }, + { + "epoch": 3.371361132966168, + "grad_norm": 0.19911994569988575, + "learning_rate": 3.181018713976924e-05, + "loss": 0.4286, + "step": 1069 + }, + { + "epoch": 3.3745082612116444, + "grad_norm": 0.24900209916902774, + "learning_rate": 3.179475844766608e-05, + "loss": 0.4332, + "step": 1070 + }, + { + "epoch": 3.3776553894571206, + "grad_norm": 0.23150612649346244, + "learning_rate": 3.1779319525325546e-05, + "loss": 0.4268, + "step": 1071 + }, + { + "epoch": 3.3808025177025964, + "grad_norm": 0.2158927674109808, + "learning_rate": 3.176387038887296e-05, + "loss": 0.4462, + "step": 1072 + }, + { + "epoch": 3.3839496459480722, + "grad_norm": 0.24924200720982193, + "learning_rate": 3.174841105444434e-05, + "loss": 0.4408, + "step": 1073 + }, + { + "epoch": 3.3870967741935485, + "grad_norm": 0.2239815418692921, + "learning_rate": 3.173294153818635e-05, + "loss": 0.4326, + "step": 1074 + }, + { + "epoch": 3.3902439024390243, + "grad_norm": 0.2199325533660836, + "learning_rate": 3.17174618562563e-05, + "loss": 0.4312, + "step": 1075 + }, + { + "epoch": 3.3933910306845005, + "grad_norm": 0.22972172889576944, + "learning_rate": 3.170197202482208e-05, + "loss": 0.4343, + "step": 1076 + }, + { + "epoch": 3.3965381589299763, + "grad_norm": 0.2094072355653265, + "learning_rate": 3.168647206006221e-05, + "loss": 0.4362, + "step": 1077 + }, + { + "epoch": 3.399685287175452, + "grad_norm": 0.19900277532000674, + "learning_rate": 3.167096197816581e-05, + "loss": 0.4346, + "step": 1078 + }, + { + "epoch": 3.4028324154209284, + "grad_norm": 0.19471319908950105, + "learning_rate": 3.1655441795332523e-05, + "loss": 0.434, + "step": 1079 + }, + { + "epoch": 3.4059795436664047, + "grad_norm": 0.24652320632202052, + "learning_rate": 3.163991152777259e-05, + "loss": 0.4446, + "step": 1080 + }, + { + "epoch": 3.4091266719118805, + "grad_norm": 0.18487872132027222, + "learning_rate": 3.162437119170673e-05, + "loss": 0.4428, + "step": 1081 + }, + { + "epoch": 3.4122738001573563, + "grad_norm": 0.18070644674221434, + "learning_rate": 3.160882080336624e-05, + "loss": 0.4345, + "step": 1082 + }, + { + "epoch": 3.4154209284028325, + "grad_norm": 0.20214727431927682, + "learning_rate": 3.1593260378992856e-05, + "loss": 0.4393, + "step": 1083 + }, + { + "epoch": 3.4185680566483083, + "grad_norm": 0.19354862366840503, + "learning_rate": 3.1577689934838847e-05, + "loss": 0.4286, + "step": 1084 + }, + { + "epoch": 3.4217151848937846, + "grad_norm": 0.17360041262575412, + "learning_rate": 3.156210948716691e-05, + "loss": 0.4395, + "step": 1085 + }, + { + "epoch": 3.4248623131392604, + "grad_norm": 0.21602699476201023, + "learning_rate": 3.1546519052250216e-05, + "loss": 0.4363, + "step": 1086 + }, + { + "epoch": 3.4280094413847366, + "grad_norm": 0.19400691100542458, + "learning_rate": 3.153091864637236e-05, + "loss": 0.4465, + "step": 1087 + }, + { + "epoch": 3.4311565696302124, + "grad_norm": 0.18853705117037867, + "learning_rate": 3.151530828582734e-05, + "loss": 0.4367, + "step": 1088 + }, + { + "epoch": 3.4343036978756887, + "grad_norm": 0.20156628021291076, + "learning_rate": 3.149968798691956e-05, + "loss": 0.4312, + "step": 1089 + }, + { + "epoch": 3.4374508261211645, + "grad_norm": 0.17508099200640428, + "learning_rate": 3.148405776596381e-05, + "loss": 0.4387, + "step": 1090 + }, + { + "epoch": 3.4405979543666403, + "grad_norm": 0.19428180773708023, + "learning_rate": 3.1468417639285234e-05, + "loss": 0.4372, + "step": 1091 + }, + { + "epoch": 3.4437450826121165, + "grad_norm": 0.18564695290847033, + "learning_rate": 3.145276762321932e-05, + "loss": 0.4372, + "step": 1092 + }, + { + "epoch": 3.4468922108575923, + "grad_norm": 0.18386558110382897, + "learning_rate": 3.1437107734111885e-05, + "loss": 0.4303, + "step": 1093 + }, + { + "epoch": 3.4500393391030686, + "grad_norm": 0.19976923603089122, + "learning_rate": 3.142143798831908e-05, + "loss": 0.4387, + "step": 1094 + }, + { + "epoch": 3.4531864673485444, + "grad_norm": 0.1967807399314723, + "learning_rate": 3.140575840220733e-05, + "loss": 0.4422, + "step": 1095 + }, + { + "epoch": 3.4563335955940206, + "grad_norm": 0.18143971647103654, + "learning_rate": 3.1390068992153336e-05, + "loss": 0.4427, + "step": 1096 + }, + { + "epoch": 3.4594807238394965, + "grad_norm": 0.19909894071095616, + "learning_rate": 3.137436977454406e-05, + "loss": 0.4413, + "step": 1097 + }, + { + "epoch": 3.4626278520849727, + "grad_norm": 0.17902127709322027, + "learning_rate": 3.135866076577673e-05, + "loss": 0.4408, + "step": 1098 + }, + { + "epoch": 3.4657749803304485, + "grad_norm": 0.1874350511676143, + "learning_rate": 3.134294198225877e-05, + "loss": 0.4458, + "step": 1099 + }, + { + "epoch": 3.4689221085759243, + "grad_norm": 0.19777946007747293, + "learning_rate": 3.132721344040783e-05, + "loss": 0.4363, + "step": 1100 + }, + { + "epoch": 3.4720692368214006, + "grad_norm": 0.21042566361266743, + "learning_rate": 3.1311475156651755e-05, + "loss": 0.4287, + "step": 1101 + }, + { + "epoch": 3.4752163650668764, + "grad_norm": 0.18234625120759887, + "learning_rate": 3.129572714742855e-05, + "loss": 0.4389, + "step": 1102 + }, + { + "epoch": 3.4783634933123526, + "grad_norm": 0.2094029102938534, + "learning_rate": 3.12799694291864e-05, + "loss": 0.4306, + "step": 1103 + }, + { + "epoch": 3.4815106215578284, + "grad_norm": 0.16484345005981205, + "learning_rate": 3.12642020183836e-05, + "loss": 0.4322, + "step": 1104 + }, + { + "epoch": 3.4846577498033047, + "grad_norm": 0.2218688219231824, + "learning_rate": 3.12484249314886e-05, + "loss": 0.4313, + "step": 1105 + }, + { + "epoch": 3.4878048780487805, + "grad_norm": 0.18992515126275933, + "learning_rate": 3.1232638184979934e-05, + "loss": 0.4378, + "step": 1106 + }, + { + "epoch": 3.4909520062942567, + "grad_norm": 0.1871055104215194, + "learning_rate": 3.1216841795346246e-05, + "loss": 0.4303, + "step": 1107 + }, + { + "epoch": 3.4940991345397325, + "grad_norm": 0.19199964417557105, + "learning_rate": 3.120103577908623e-05, + "loss": 0.441, + "step": 1108 + }, + { + "epoch": 3.4972462627852083, + "grad_norm": 0.1856197404311817, + "learning_rate": 3.1185220152708645e-05, + "loss": 0.4327, + "step": 1109 + }, + { + "epoch": 3.5003933910306846, + "grad_norm": 0.1770808469670125, + "learning_rate": 3.116939493273228e-05, + "loss": 0.4379, + "step": 1110 + }, + { + "epoch": 3.5035405192761604, + "grad_norm": 0.17080394749356279, + "learning_rate": 3.115356013568597e-05, + "loss": 0.434, + "step": 1111 + }, + { + "epoch": 3.5066876475216366, + "grad_norm": 0.19343264397189958, + "learning_rate": 3.113771577810852e-05, + "loss": 0.4349, + "step": 1112 + }, + { + "epoch": 3.5098347757671124, + "grad_norm": 0.17286156111192222, + "learning_rate": 3.1121861876548736e-05, + "loss": 0.443, + "step": 1113 + }, + { + "epoch": 3.5129819040125883, + "grad_norm": 0.19138302591060105, + "learning_rate": 3.1105998447565383e-05, + "loss": 0.4447, + "step": 1114 + }, + { + "epoch": 3.5161290322580645, + "grad_norm": 0.16413512248871734, + "learning_rate": 3.10901255077272e-05, + "loss": 0.4468, + "step": 1115 + }, + { + "epoch": 3.5192761605035408, + "grad_norm": 0.1914477065729763, + "learning_rate": 3.1074243073612834e-05, + "loss": 0.4309, + "step": 1116 + }, + { + "epoch": 3.5224232887490166, + "grad_norm": 0.188288353753066, + "learning_rate": 3.105835116181086e-05, + "loss": 0.4355, + "step": 1117 + }, + { + "epoch": 3.5255704169944924, + "grad_norm": 0.17465184377745524, + "learning_rate": 3.104244978891975e-05, + "loss": 0.4355, + "step": 1118 + }, + { + "epoch": 3.5287175452399686, + "grad_norm": 0.1825953673131463, + "learning_rate": 3.102653897154786e-05, + "loss": 0.4316, + "step": 1119 + }, + { + "epoch": 3.5318646734854444, + "grad_norm": 0.1743986993607113, + "learning_rate": 3.1010618726313405e-05, + "loss": 0.4331, + "step": 1120 + }, + { + "epoch": 3.5350118017309207, + "grad_norm": 0.17821957810877814, + "learning_rate": 3.099468906984446e-05, + "loss": 0.4345, + "step": 1121 + }, + { + "epoch": 3.5381589299763965, + "grad_norm": 0.2093960838490045, + "learning_rate": 3.097875001877891e-05, + "loss": 0.4387, + "step": 1122 + }, + { + "epoch": 3.5413060582218723, + "grad_norm": 0.17177569091014847, + "learning_rate": 3.0962801589764474e-05, + "loss": 0.4282, + "step": 1123 + }, + { + "epoch": 3.5444531864673485, + "grad_norm": 0.1748649502758884, + "learning_rate": 3.094684379945865e-05, + "loss": 0.434, + "step": 1124 + }, + { + "epoch": 3.5476003147128248, + "grad_norm": 0.19080807496580413, + "learning_rate": 3.093087666452871e-05, + "loss": 0.4386, + "step": 1125 + }, + { + "epoch": 3.5507474429583006, + "grad_norm": 0.17652468445293854, + "learning_rate": 3.09149002016517e-05, + "loss": 0.4391, + "step": 1126 + }, + { + "epoch": 3.5538945712037764, + "grad_norm": 0.19062712789092418, + "learning_rate": 3.08989144275144e-05, + "loss": 0.43, + "step": 1127 + }, + { + "epoch": 3.5570416994492526, + "grad_norm": 0.17546579858596842, + "learning_rate": 3.088291935881333e-05, + "loss": 0.435, + "step": 1128 + }, + { + "epoch": 3.5601888276947284, + "grad_norm": 0.21065753936700307, + "learning_rate": 3.08669150122547e-05, + "loss": 0.4233, + "step": 1129 + }, + { + "epoch": 3.5633359559402047, + "grad_norm": 0.16676658656556034, + "learning_rate": 3.0850901404554404e-05, + "loss": 0.4419, + "step": 1130 + }, + { + "epoch": 3.5664830841856805, + "grad_norm": 0.2075035100795957, + "learning_rate": 3.083487855243804e-05, + "loss": 0.4374, + "step": 1131 + }, + { + "epoch": 3.5696302124311563, + "grad_norm": 0.16571698042635005, + "learning_rate": 3.081884647264083e-05, + "loss": 0.4385, + "step": 1132 + }, + { + "epoch": 3.5727773406766326, + "grad_norm": 0.19707520555123104, + "learning_rate": 3.080280518190765e-05, + "loss": 0.4445, + "step": 1133 + }, + { + "epoch": 3.575924468922109, + "grad_norm": 0.17132281905111424, + "learning_rate": 3.078675469699299e-05, + "loss": 0.4379, + "step": 1134 + }, + { + "epoch": 3.5790715971675846, + "grad_norm": 0.1865379519918738, + "learning_rate": 3.077069503466095e-05, + "loss": 0.4324, + "step": 1135 + }, + { + "epoch": 3.5822187254130604, + "grad_norm": 0.20303319475170387, + "learning_rate": 3.075462621168521e-05, + "loss": 0.4335, + "step": 1136 + }, + { + "epoch": 3.5853658536585367, + "grad_norm": 0.1769716572277089, + "learning_rate": 3.0738548244849024e-05, + "loss": 0.4414, + "step": 1137 + }, + { + "epoch": 3.5885129819040125, + "grad_norm": 0.2119201910437755, + "learning_rate": 3.072246115094519e-05, + "loss": 0.4347, + "step": 1138 + }, + { + "epoch": 3.5916601101494887, + "grad_norm": 0.18291536465188502, + "learning_rate": 3.070636494677603e-05, + "loss": 0.4297, + "step": 1139 + }, + { + "epoch": 3.5948072383949645, + "grad_norm": 0.2018907273302855, + "learning_rate": 3.0690259649153414e-05, + "loss": 0.4369, + "step": 1140 + }, + { + "epoch": 3.5979543666404403, + "grad_norm": 0.18931019337202662, + "learning_rate": 3.067414527489866e-05, + "loss": 0.4385, + "step": 1141 + }, + { + "epoch": 3.6011014948859166, + "grad_norm": 0.17894402262594664, + "learning_rate": 3.0658021840842615e-05, + "loss": 0.4317, + "step": 1142 + }, + { + "epoch": 3.604248623131393, + "grad_norm": 0.19224386856066833, + "learning_rate": 3.0641889363825566e-05, + "loss": 0.4295, + "step": 1143 + }, + { + "epoch": 3.6073957513768686, + "grad_norm": 0.1823104308503007, + "learning_rate": 3.062574786069723e-05, + "loss": 0.4381, + "step": 1144 + }, + { + "epoch": 3.6105428796223444, + "grad_norm": 0.1845807265379901, + "learning_rate": 3.0609597348316784e-05, + "loss": 0.4443, + "step": 1145 + }, + { + "epoch": 3.6136900078678207, + "grad_norm": 0.1752786351405681, + "learning_rate": 3.05934378435528e-05, + "loss": 0.4269, + "step": 1146 + }, + { + "epoch": 3.6168371361132965, + "grad_norm": 0.17650135711502488, + "learning_rate": 3.057726936328323e-05, + "loss": 0.4344, + "step": 1147 + }, + { + "epoch": 3.6199842643587727, + "grad_norm": 0.19322880994465225, + "learning_rate": 3.056109192439541e-05, + "loss": 0.4286, + "step": 1148 + }, + { + "epoch": 3.6231313926042485, + "grad_norm": 0.1664813830978989, + "learning_rate": 3.0544905543786045e-05, + "loss": 0.434, + "step": 1149 + }, + { + "epoch": 3.6262785208497244, + "grad_norm": 0.19170305910822624, + "learning_rate": 3.052871023836116e-05, + "loss": 0.4432, + "step": 1150 + }, + { + "epoch": 3.6294256490952006, + "grad_norm": 0.1854811898273995, + "learning_rate": 3.051250602503612e-05, + "loss": 0.4335, + "step": 1151 + }, + { + "epoch": 3.632572777340677, + "grad_norm": 0.1822902853195308, + "learning_rate": 3.0496292920735574e-05, + "loss": 0.4397, + "step": 1152 + }, + { + "epoch": 3.6357199055861527, + "grad_norm": 0.16630950273159906, + "learning_rate": 3.0480070942393483e-05, + "loss": 0.4441, + "step": 1153 + }, + { + "epoch": 3.6388670338316285, + "grad_norm": 0.1661679586279354, + "learning_rate": 3.046384010695304e-05, + "loss": 0.4394, + "step": 1154 + }, + { + "epoch": 3.6420141620771047, + "grad_norm": 0.1564352636813857, + "learning_rate": 3.0447600431366724e-05, + "loss": 0.4438, + "step": 1155 + }, + { + "epoch": 3.6451612903225805, + "grad_norm": 0.17161921802692476, + "learning_rate": 3.043135193259623e-05, + "loss": 0.4343, + "step": 1156 + }, + { + "epoch": 3.6483084185680568, + "grad_norm": 0.18351798204850334, + "learning_rate": 3.0415094627612464e-05, + "loss": 0.4402, + "step": 1157 + }, + { + "epoch": 3.6514555468135326, + "grad_norm": 0.17135389498561554, + "learning_rate": 3.0398828533395547e-05, + "loss": 0.4324, + "step": 1158 + }, + { + "epoch": 3.654602675059009, + "grad_norm": 0.19814568295329985, + "learning_rate": 3.0382553666934777e-05, + "loss": 0.4418, + "step": 1159 + }, + { + "epoch": 3.6577498033044846, + "grad_norm": 0.17713962766442853, + "learning_rate": 3.036627004522859e-05, + "loss": 0.4258, + "step": 1160 + }, + { + "epoch": 3.660896931549961, + "grad_norm": 0.17627230414185083, + "learning_rate": 3.0349977685284596e-05, + "loss": 0.437, + "step": 1161 + }, + { + "epoch": 3.6640440597954367, + "grad_norm": 0.21033833731933352, + "learning_rate": 3.0333676604119512e-05, + "loss": 0.4359, + "step": 1162 + }, + { + "epoch": 3.6671911880409125, + "grad_norm": 0.16631632754069195, + "learning_rate": 3.0317366818759183e-05, + "loss": 0.4416, + "step": 1163 + }, + { + "epoch": 3.6703383162863887, + "grad_norm": 0.16042229001940653, + "learning_rate": 3.0301048346238522e-05, + "loss": 0.4332, + "step": 1164 + }, + { + "epoch": 3.6734854445318645, + "grad_norm": 0.1681130029892185, + "learning_rate": 3.028472120360153e-05, + "loss": 0.4435, + "step": 1165 + }, + { + "epoch": 3.676632572777341, + "grad_norm": 0.15489295343278095, + "learning_rate": 3.0268385407901267e-05, + "loss": 0.4301, + "step": 1166 + }, + { + "epoch": 3.6797797010228166, + "grad_norm": 0.1856105350391872, + "learning_rate": 3.025204097619982e-05, + "loss": 0.4384, + "step": 1167 + }, + { + "epoch": 3.682926829268293, + "grad_norm": 0.17420244127013473, + "learning_rate": 3.0235687925568308e-05, + "loss": 0.4474, + "step": 1168 + }, + { + "epoch": 3.6860739575137687, + "grad_norm": 0.16283454261310373, + "learning_rate": 3.021932627308684e-05, + "loss": 0.446, + "step": 1169 + }, + { + "epoch": 3.689221085759245, + "grad_norm": 0.16652733620538568, + "learning_rate": 3.020295603584451e-05, + "loss": 0.4385, + "step": 1170 + }, + { + "epoch": 3.6923682140047207, + "grad_norm": 0.16838451835416196, + "learning_rate": 3.0186577230939383e-05, + "loss": 0.4383, + "step": 1171 + }, + { + "epoch": 3.6955153422501965, + "grad_norm": 0.20256218511665658, + "learning_rate": 3.017018987547848e-05, + "loss": 0.4468, + "step": 1172 + }, + { + "epoch": 3.6986624704956728, + "grad_norm": 0.1789463978366356, + "learning_rate": 3.015379398657774e-05, + "loss": 0.4436, + "step": 1173 + }, + { + "epoch": 3.7018095987411486, + "grad_norm": 0.2254816031556007, + "learning_rate": 3.0137389581362012e-05, + "loss": 0.4402, + "step": 1174 + }, + { + "epoch": 3.704956726986625, + "grad_norm": 0.19908506142514173, + "learning_rate": 3.0120976676965065e-05, + "loss": 0.437, + "step": 1175 + }, + { + "epoch": 3.7081038552321006, + "grad_norm": 0.17538124081020218, + "learning_rate": 3.010455529052952e-05, + "loss": 0.4495, + "step": 1176 + }, + { + "epoch": 3.711250983477577, + "grad_norm": 0.20509162893468286, + "learning_rate": 3.0088125439206854e-05, + "loss": 0.4432, + "step": 1177 + }, + { + "epoch": 3.7143981117230527, + "grad_norm": 0.18192579811095336, + "learning_rate": 3.0071687140157413e-05, + "loss": 0.4388, + "step": 1178 + }, + { + "epoch": 3.717545239968529, + "grad_norm": 0.21503535709810237, + "learning_rate": 3.005524041055034e-05, + "loss": 0.4351, + "step": 1179 + }, + { + "epoch": 3.7206923682140047, + "grad_norm": 0.17400711170435434, + "learning_rate": 3.00387852675636e-05, + "loss": 0.4492, + "step": 1180 + }, + { + "epoch": 3.7238394964594805, + "grad_norm": 0.24313295347800967, + "learning_rate": 3.0022321728383933e-05, + "loss": 0.4315, + "step": 1181 + }, + { + "epoch": 3.726986624704957, + "grad_norm": 0.1478818146322245, + "learning_rate": 3.0005849810206845e-05, + "loss": 0.4363, + "step": 1182 + }, + { + "epoch": 3.7301337529504326, + "grad_norm": 0.20144526501549903, + "learning_rate": 2.9989369530236618e-05, + "loss": 0.4426, + "step": 1183 + }, + { + "epoch": 3.733280881195909, + "grad_norm": 0.18206029920285993, + "learning_rate": 2.9972880905686246e-05, + "loss": 0.4344, + "step": 1184 + }, + { + "epoch": 3.7364280094413846, + "grad_norm": 0.18180540417395807, + "learning_rate": 2.9956383953777442e-05, + "loss": 0.4424, + "step": 1185 + }, + { + "epoch": 3.739575137686861, + "grad_norm": 0.191677223977247, + "learning_rate": 2.9939878691740625e-05, + "loss": 0.4304, + "step": 1186 + }, + { + "epoch": 3.7427222659323367, + "grad_norm": 0.17576860046619955, + "learning_rate": 2.9923365136814876e-05, + "loss": 0.4432, + "step": 1187 + }, + { + "epoch": 3.745869394177813, + "grad_norm": 0.18177898566663142, + "learning_rate": 2.9906843306247965e-05, + "loss": 0.4315, + "step": 1188 + }, + { + "epoch": 3.7490165224232888, + "grad_norm": 0.19227267623614985, + "learning_rate": 2.9890313217296277e-05, + "loss": 0.4368, + "step": 1189 + }, + { + "epoch": 3.7521636506687646, + "grad_norm": 0.19404291468297713, + "learning_rate": 2.9873774887224844e-05, + "loss": 0.4418, + "step": 1190 + }, + { + "epoch": 3.755310778914241, + "grad_norm": 0.17297469013091823, + "learning_rate": 2.985722833330729e-05, + "loss": 0.4276, + "step": 1191 + }, + { + "epoch": 3.7584579071597166, + "grad_norm": 0.21580498854793478, + "learning_rate": 2.984067357282584e-05, + "loss": 0.438, + "step": 1192 + }, + { + "epoch": 3.761605035405193, + "grad_norm": 0.19970366933915254, + "learning_rate": 2.9824110623071285e-05, + "loss": 0.4429, + "step": 1193 + }, + { + "epoch": 3.7647521636506687, + "grad_norm": 0.1893900043304486, + "learning_rate": 2.980753950134297e-05, + "loss": 0.4425, + "step": 1194 + }, + { + "epoch": 3.767899291896145, + "grad_norm": 0.1844133531075253, + "learning_rate": 2.979096022494878e-05, + "loss": 0.4345, + "step": 1195 + }, + { + "epoch": 3.7710464201416207, + "grad_norm": 0.1764704510378753, + "learning_rate": 2.9774372811205104e-05, + "loss": 0.4404, + "step": 1196 + }, + { + "epoch": 3.774193548387097, + "grad_norm": 0.1937035801525317, + "learning_rate": 2.975777727743684e-05, + "loss": 0.4386, + "step": 1197 + }, + { + "epoch": 3.777340676632573, + "grad_norm": 0.18365070141139342, + "learning_rate": 2.9741173640977372e-05, + "loss": 0.4331, + "step": 1198 + }, + { + "epoch": 3.7804878048780486, + "grad_norm": 0.17907574402326445, + "learning_rate": 2.9724561919168536e-05, + "loss": 0.4411, + "step": 1199 + }, + { + "epoch": 3.783634933123525, + "grad_norm": 0.1915338900258077, + "learning_rate": 2.9707942129360622e-05, + "loss": 0.4336, + "step": 1200 + }, + { + "epoch": 3.7867820613690006, + "grad_norm": 0.18426098050440218, + "learning_rate": 2.969131428891234e-05, + "loss": 0.4352, + "step": 1201 + }, + { + "epoch": 3.789929189614477, + "grad_norm": 0.19246246891896052, + "learning_rate": 2.967467841519081e-05, + "loss": 0.4281, + "step": 1202 + }, + { + "epoch": 3.7930763178599527, + "grad_norm": 0.199901468879607, + "learning_rate": 2.9658034525571543e-05, + "loss": 0.4401, + "step": 1203 + }, + { + "epoch": 3.796223446105429, + "grad_norm": 0.17624439473519934, + "learning_rate": 2.964138263743843e-05, + "loss": 0.4343, + "step": 1204 + }, + { + "epoch": 3.7993705743509048, + "grad_norm": 0.20949517161628303, + "learning_rate": 2.96247227681837e-05, + "loss": 0.4284, + "step": 1205 + }, + { + "epoch": 3.802517702596381, + "grad_norm": 0.1841705003012857, + "learning_rate": 2.9608054935207925e-05, + "loss": 0.4392, + "step": 1206 + }, + { + "epoch": 3.805664830841857, + "grad_norm": 0.20023545812113352, + "learning_rate": 2.959137915592e-05, + "loss": 0.4403, + "step": 1207 + }, + { + "epoch": 3.8088119590873326, + "grad_norm": 0.18181285159081859, + "learning_rate": 2.9574695447737126e-05, + "loss": 0.4301, + "step": 1208 + }, + { + "epoch": 3.811959087332809, + "grad_norm": 0.4591999499033323, + "learning_rate": 2.9558003828084768e-05, + "loss": 0.4444, + "step": 1209 + }, + { + "epoch": 3.8151062155782847, + "grad_norm": 0.2085648435684365, + "learning_rate": 2.9541304314396653e-05, + "loss": 0.4325, + "step": 1210 + }, + { + "epoch": 3.818253343823761, + "grad_norm": 0.20463216335646361, + "learning_rate": 2.9524596924114776e-05, + "loss": 0.4345, + "step": 1211 + }, + { + "epoch": 3.8214004720692367, + "grad_norm": 0.2006047130461185, + "learning_rate": 2.950788167468934e-05, + "loss": 0.4391, + "step": 1212 + }, + { + "epoch": 3.824547600314713, + "grad_norm": 0.18827426151401724, + "learning_rate": 2.9491158583578753e-05, + "loss": 0.4358, + "step": 1213 + }, + { + "epoch": 3.8276947285601888, + "grad_norm": 0.19581009849077824, + "learning_rate": 2.947442766824963e-05, + "loss": 0.4441, + "step": 1214 + }, + { + "epoch": 3.830841856805665, + "grad_norm": 0.17734874484349197, + "learning_rate": 2.9457688946176746e-05, + "loss": 0.4274, + "step": 1215 + }, + { + "epoch": 3.833988985051141, + "grad_norm": 0.17277936701165808, + "learning_rate": 2.9440942434843042e-05, + "loss": 0.4367, + "step": 1216 + }, + { + "epoch": 3.8371361132966166, + "grad_norm": 0.18624753492705168, + "learning_rate": 2.942418815173958e-05, + "loss": 0.4431, + "step": 1217 + }, + { + "epoch": 3.840283241542093, + "grad_norm": 0.17981530092582268, + "learning_rate": 2.9407426114365538e-05, + "loss": 0.4488, + "step": 1218 + }, + { + "epoch": 3.8434303697875687, + "grad_norm": 0.17417936860740793, + "learning_rate": 2.9390656340228215e-05, + "loss": 0.4386, + "step": 1219 + }, + { + "epoch": 3.846577498033045, + "grad_norm": 0.1767282033703042, + "learning_rate": 2.9373878846842964e-05, + "loss": 0.4232, + "step": 1220 + }, + { + "epoch": 3.8497246262785207, + "grad_norm": 0.17726819566715343, + "learning_rate": 2.935709365173321e-05, + "loss": 0.4372, + "step": 1221 + }, + { + "epoch": 3.852871754523997, + "grad_norm": 0.18641228873224577, + "learning_rate": 2.934030077243044e-05, + "loss": 0.4539, + "step": 1222 + }, + { + "epoch": 3.856018882769473, + "grad_norm": 0.18523506965437314, + "learning_rate": 2.932350022647414e-05, + "loss": 0.44, + "step": 1223 + }, + { + "epoch": 3.859166011014949, + "grad_norm": 0.17429118934670704, + "learning_rate": 2.9306692031411817e-05, + "loss": 0.4419, + "step": 1224 + }, + { + "epoch": 3.862313139260425, + "grad_norm": 0.16912829692351863, + "learning_rate": 2.9289876204798973e-05, + "loss": 0.445, + "step": 1225 + }, + { + "epoch": 3.8654602675059007, + "grad_norm": 0.16689978761928095, + "learning_rate": 2.927305276419906e-05, + "loss": 0.4399, + "step": 1226 + }, + { + "epoch": 3.868607395751377, + "grad_norm": 0.1662616285588503, + "learning_rate": 2.9256221727183508e-05, + "loss": 0.4439, + "step": 1227 + }, + { + "epoch": 3.8717545239968527, + "grad_norm": 0.17547371330423203, + "learning_rate": 2.923938311133165e-05, + "loss": 0.4374, + "step": 1228 + }, + { + "epoch": 3.874901652242329, + "grad_norm": 0.16899679456608838, + "learning_rate": 2.922253693423078e-05, + "loss": 0.4403, + "step": 1229 + }, + { + "epoch": 3.8780487804878048, + "grad_norm": 0.17142815065151418, + "learning_rate": 2.920568321347604e-05, + "loss": 0.4491, + "step": 1230 + }, + { + "epoch": 3.881195908733281, + "grad_norm": 0.1659629352966584, + "learning_rate": 2.918882196667049e-05, + "loss": 0.4442, + "step": 1231 + }, + { + "epoch": 3.884343036978757, + "grad_norm": 0.1827137404665224, + "learning_rate": 2.9171953211425027e-05, + "loss": 0.4462, + "step": 1232 + }, + { + "epoch": 3.887490165224233, + "grad_norm": 0.16843932419439855, + "learning_rate": 2.9155076965358397e-05, + "loss": 0.4425, + "step": 1233 + }, + { + "epoch": 3.890637293469709, + "grad_norm": 0.1895572765071493, + "learning_rate": 2.9138193246097172e-05, + "loss": 0.4386, + "step": 1234 + }, + { + "epoch": 3.8937844217151847, + "grad_norm": 0.17922902546291508, + "learning_rate": 2.912130207127573e-05, + "loss": 0.4341, + "step": 1235 + }, + { + "epoch": 3.896931549960661, + "grad_norm": 0.17772714370132225, + "learning_rate": 2.9104403458536238e-05, + "loss": 0.4444, + "step": 1236 + }, + { + "epoch": 3.9000786782061367, + "grad_norm": 0.18719240088700048, + "learning_rate": 2.9087497425528618e-05, + "loss": 0.4329, + "step": 1237 + }, + { + "epoch": 3.903225806451613, + "grad_norm": 0.19116327201990352, + "learning_rate": 2.9070583989910556e-05, + "loss": 0.4393, + "step": 1238 + }, + { + "epoch": 3.906372934697089, + "grad_norm": 0.1888243799061614, + "learning_rate": 2.905366316934747e-05, + "loss": 0.4404, + "step": 1239 + }, + { + "epoch": 3.909520062942565, + "grad_norm": 0.19923427176985103, + "learning_rate": 2.9036734981512484e-05, + "loss": 0.4433, + "step": 1240 + }, + { + "epoch": 3.912667191188041, + "grad_norm": 0.184137878281804, + "learning_rate": 2.9019799444086425e-05, + "loss": 0.4451, + "step": 1241 + }, + { + "epoch": 3.915814319433517, + "grad_norm": 0.16318104017822477, + "learning_rate": 2.9002856574757777e-05, + "loss": 0.4459, + "step": 1242 + }, + { + "epoch": 3.918961447678993, + "grad_norm": 0.1834621256694157, + "learning_rate": 2.898590639122272e-05, + "loss": 0.4432, + "step": 1243 + }, + { + "epoch": 3.9221085759244687, + "grad_norm": 0.19023161460692964, + "learning_rate": 2.8968948911185018e-05, + "loss": 0.4411, + "step": 1244 + }, + { + "epoch": 3.925255704169945, + "grad_norm": 0.18356230097493653, + "learning_rate": 2.8951984152356117e-05, + "loss": 0.4365, + "step": 1245 + }, + { + "epoch": 3.9284028324154208, + "grad_norm": 0.19309532709059088, + "learning_rate": 2.8935012132455024e-05, + "loss": 0.4329, + "step": 1246 + }, + { + "epoch": 3.931549960660897, + "grad_norm": 0.1790999571642992, + "learning_rate": 2.8918032869208335e-05, + "loss": 0.44, + "step": 1247 + }, + { + "epoch": 3.934697088906373, + "grad_norm": 0.17778758227368407, + "learning_rate": 2.8901046380350227e-05, + "loss": 0.4369, + "step": 1248 + }, + { + "epoch": 3.937844217151849, + "grad_norm": 0.16705242650281665, + "learning_rate": 2.8884052683622408e-05, + "loss": 0.4416, + "step": 1249 + }, + { + "epoch": 3.940991345397325, + "grad_norm": 0.17118532687053076, + "learning_rate": 2.886705179677414e-05, + "loss": 0.4355, + "step": 1250 + }, + { + "epoch": 3.944138473642801, + "grad_norm": 0.17794134938829217, + "learning_rate": 2.885004373756215e-05, + "loss": 0.4362, + "step": 1251 + }, + { + "epoch": 3.947285601888277, + "grad_norm": 0.16850951168986514, + "learning_rate": 2.88330285237507e-05, + "loss": 0.439, + "step": 1252 + }, + { + "epoch": 3.9504327301337527, + "grad_norm": 0.17017397926948555, + "learning_rate": 2.8816006173111504e-05, + "loss": 0.4379, + "step": 1253 + }, + { + "epoch": 3.953579858379229, + "grad_norm": 0.1595665503643, + "learning_rate": 2.8798976703423726e-05, + "loss": 0.4416, + "step": 1254 + }, + { + "epoch": 3.9567269866247052, + "grad_norm": 0.17016824414153592, + "learning_rate": 2.8781940132473977e-05, + "loss": 0.437, + "step": 1255 + }, + { + "epoch": 3.959874114870181, + "grad_norm": 0.1768366345187553, + "learning_rate": 2.8764896478056287e-05, + "loss": 0.4405, + "step": 1256 + }, + { + "epoch": 3.963021243115657, + "grad_norm": 0.20215798109859642, + "learning_rate": 2.874784575797207e-05, + "loss": 0.4407, + "step": 1257 + }, + { + "epoch": 3.966168371361133, + "grad_norm": 0.19222521069705137, + "learning_rate": 2.8730787990030138e-05, + "loss": 0.4333, + "step": 1258 + }, + { + "epoch": 3.969315499606609, + "grad_norm": 0.1768154833619316, + "learning_rate": 2.8713723192046637e-05, + "loss": 0.4423, + "step": 1259 + }, + { + "epoch": 3.972462627852085, + "grad_norm": 0.183349858541064, + "learning_rate": 2.8696651381845094e-05, + "loss": 0.4443, + "step": 1260 + }, + { + "epoch": 3.975609756097561, + "grad_norm": 0.19201674367546054, + "learning_rate": 2.8679572577256324e-05, + "loss": 0.4362, + "step": 1261 + }, + { + "epoch": 3.9787568843430368, + "grad_norm": 0.18098429025124854, + "learning_rate": 2.866248679611846e-05, + "loss": 0.4339, + "step": 1262 + }, + { + "epoch": 3.981904012588513, + "grad_norm": 0.21698099039863375, + "learning_rate": 2.8645394056276936e-05, + "loss": 0.4356, + "step": 1263 + }, + { + "epoch": 3.9850511408339893, + "grad_norm": 0.19347949036380474, + "learning_rate": 2.862829437558443e-05, + "loss": 0.4435, + "step": 1264 + }, + { + "epoch": 3.988198269079465, + "grad_norm": 0.19634210846658384, + "learning_rate": 2.8611187771900897e-05, + "loss": 0.4359, + "step": 1265 + }, + { + "epoch": 3.991345397324941, + "grad_norm": 0.20372267012008688, + "learning_rate": 2.8594074263093495e-05, + "loss": 0.4435, + "step": 1266 + }, + { + "epoch": 3.994492525570417, + "grad_norm": 0.19018939442925267, + "learning_rate": 2.8576953867036605e-05, + "loss": 0.435, + "step": 1267 + }, + { + "epoch": 3.997639653815893, + "grad_norm": 0.17380970464724402, + "learning_rate": 2.855982660161181e-05, + "loss": 0.4368, + "step": 1268 + }, + { + "epoch": 4.003147128245476, + "grad_norm": 0.4567009200647522, + "learning_rate": 2.854269248470786e-05, + "loss": 0.8291, + "step": 1269 + }, + { + "epoch": 4.006294256490952, + "grad_norm": 0.3540594453182581, + "learning_rate": 2.8525551534220657e-05, + "loss": 0.3842, + "step": 1270 + }, + { + "epoch": 4.009441384736428, + "grad_norm": 0.3137640124559675, + "learning_rate": 2.8508403768053242e-05, + "loss": 0.3803, + "step": 1271 + }, + { + "epoch": 4.012588512981904, + "grad_norm": 0.3411489806848952, + "learning_rate": 2.8491249204115784e-05, + "loss": 0.3877, + "step": 1272 + }, + { + "epoch": 4.01573564122738, + "grad_norm": 0.32660510853569047, + "learning_rate": 2.847408786032555e-05, + "loss": 0.389, + "step": 1273 + }, + { + "epoch": 4.018882769472856, + "grad_norm": 0.3039688244842211, + "learning_rate": 2.845691975460688e-05, + "loss": 0.381, + "step": 1274 + }, + { + "epoch": 4.022029897718332, + "grad_norm": 0.2867459711491588, + "learning_rate": 2.8439744904891178e-05, + "loss": 0.3768, + "step": 1275 + }, + { + "epoch": 4.025177025963808, + "grad_norm": 0.2971765367481167, + "learning_rate": 2.8422563329116898e-05, + "loss": 0.3887, + "step": 1276 + }, + { + "epoch": 4.028324154209284, + "grad_norm": 0.26012973313503523, + "learning_rate": 2.8405375045229512e-05, + "loss": 0.3872, + "step": 1277 + }, + { + "epoch": 4.03147128245476, + "grad_norm": 0.24236515760372213, + "learning_rate": 2.83881800711815e-05, + "loss": 0.3911, + "step": 1278 + }, + { + "epoch": 4.034618410700236, + "grad_norm": 0.27142579496532293, + "learning_rate": 2.837097842493234e-05, + "loss": 0.3927, + "step": 1279 + }, + { + "epoch": 4.037765538945712, + "grad_norm": 0.24067712701294466, + "learning_rate": 2.8353770124448467e-05, + "loss": 0.3851, + "step": 1280 + }, + { + "epoch": 4.040912667191188, + "grad_norm": 0.2388177326791406, + "learning_rate": 2.8336555187703266e-05, + "loss": 0.377, + "step": 1281 + }, + { + "epoch": 4.044059795436664, + "grad_norm": 0.24190387059110036, + "learning_rate": 2.8319333632677062e-05, + "loss": 0.3819, + "step": 1282 + }, + { + "epoch": 4.04720692368214, + "grad_norm": 0.233811700600284, + "learning_rate": 2.830210547735708e-05, + "loss": 0.374, + "step": 1283 + }, + { + "epoch": 4.050354051927616, + "grad_norm": 0.21990274300198742, + "learning_rate": 2.8284870739737456e-05, + "loss": 0.3801, + "step": 1284 + }, + { + "epoch": 4.053501180173092, + "grad_norm": 0.24925704152570827, + "learning_rate": 2.826762943781918e-05, + "loss": 0.3833, + "step": 1285 + }, + { + "epoch": 4.056648308418568, + "grad_norm": 0.2281128058726442, + "learning_rate": 2.825038158961012e-05, + "loss": 0.3849, + "step": 1286 + }, + { + "epoch": 4.059795436664044, + "grad_norm": 0.20398518468216212, + "learning_rate": 2.823312721312496e-05, + "loss": 0.3749, + "step": 1287 + }, + { + "epoch": 4.06294256490952, + "grad_norm": 0.21358990395381772, + "learning_rate": 2.8215866326385222e-05, + "loss": 0.389, + "step": 1288 + }, + { + "epoch": 4.066089693154996, + "grad_norm": 0.1998290493946192, + "learning_rate": 2.8198598947419222e-05, + "loss": 0.3746, + "step": 1289 + }, + { + "epoch": 4.069236821400472, + "grad_norm": 0.2136797295751118, + "learning_rate": 2.818132509426204e-05, + "loss": 0.3873, + "step": 1290 + }, + { + "epoch": 4.072383949645948, + "grad_norm": 0.19695872962181996, + "learning_rate": 2.8164044784955536e-05, + "loss": 0.387, + "step": 1291 + }, + { + "epoch": 4.075531077891424, + "grad_norm": 0.23535924524823093, + "learning_rate": 2.814675803754831e-05, + "loss": 0.3875, + "step": 1292 + }, + { + "epoch": 4.0786782061369005, + "grad_norm": 0.20439185050875183, + "learning_rate": 2.8129464870095697e-05, + "loss": 0.3765, + "step": 1293 + }, + { + "epoch": 4.081825334382376, + "grad_norm": 0.22225008665143714, + "learning_rate": 2.8112165300659714e-05, + "loss": 0.3779, + "step": 1294 + }, + { + "epoch": 4.084972462627852, + "grad_norm": 0.21036944845424835, + "learning_rate": 2.809485934730907e-05, + "loss": 0.4008, + "step": 1295 + }, + { + "epoch": 4.088119590873328, + "grad_norm": 0.252462473354712, + "learning_rate": 2.807754702811916e-05, + "loss": 0.3867, + "step": 1296 + }, + { + "epoch": 4.091266719118804, + "grad_norm": 0.22970311838363114, + "learning_rate": 2.8060228361172012e-05, + "loss": 0.387, + "step": 1297 + }, + { + "epoch": 4.09441384736428, + "grad_norm": 0.19785518482856718, + "learning_rate": 2.804290336455629e-05, + "loss": 0.384, + "step": 1298 + }, + { + "epoch": 4.097560975609756, + "grad_norm": 0.2375083186949961, + "learning_rate": 2.8025572056367263e-05, + "loss": 0.3802, + "step": 1299 + }, + { + "epoch": 4.100708103855232, + "grad_norm": 0.20170571559199502, + "learning_rate": 2.8008234454706795e-05, + "loss": 0.378, + "step": 1300 + }, + { + "epoch": 4.103855232100708, + "grad_norm": 0.19637734454839414, + "learning_rate": 2.799089057768333e-05, + "loss": 0.3841, + "step": 1301 + }, + { + "epoch": 4.1070023603461845, + "grad_norm": 0.20324902703498704, + "learning_rate": 2.797354044341186e-05, + "loss": 0.389, + "step": 1302 + }, + { + "epoch": 4.11014948859166, + "grad_norm": 0.2011077455123792, + "learning_rate": 2.7956184070013912e-05, + "loss": 0.3813, + "step": 1303 + }, + { + "epoch": 4.113296616837136, + "grad_norm": 0.18553531787475813, + "learning_rate": 2.7938821475617523e-05, + "loss": 0.3829, + "step": 1304 + }, + { + "epoch": 4.116443745082612, + "grad_norm": 0.19878439370656475, + "learning_rate": 2.792145267835725e-05, + "loss": 0.3738, + "step": 1305 + }, + { + "epoch": 4.119590873328088, + "grad_norm": 0.19762338018115141, + "learning_rate": 2.7904077696374107e-05, + "loss": 0.3796, + "step": 1306 + }, + { + "epoch": 4.122738001573564, + "grad_norm": 0.21096214326769303, + "learning_rate": 2.7886696547815568e-05, + "loss": 0.3764, + "step": 1307 + }, + { + "epoch": 4.12588512981904, + "grad_norm": 0.19386489654228445, + "learning_rate": 2.7869309250835565e-05, + "loss": 0.3808, + "step": 1308 + }, + { + "epoch": 4.129032258064516, + "grad_norm": 0.22093269977952554, + "learning_rate": 2.7851915823594442e-05, + "loss": 0.3788, + "step": 1309 + }, + { + "epoch": 4.132179386309992, + "grad_norm": 0.2004090636824933, + "learning_rate": 2.783451628425893e-05, + "loss": 0.3789, + "step": 1310 + }, + { + "epoch": 4.1353265145554685, + "grad_norm": 0.198727713176911, + "learning_rate": 2.7817110651002183e-05, + "loss": 0.3818, + "step": 1311 + }, + { + "epoch": 4.138473642800944, + "grad_norm": 0.24944897390496926, + "learning_rate": 2.779969894200367e-05, + "loss": 0.3815, + "step": 1312 + }, + { + "epoch": 4.14162077104642, + "grad_norm": 0.17429094692844807, + "learning_rate": 2.7782281175449246e-05, + "loss": 0.3805, + "step": 1313 + }, + { + "epoch": 4.144767899291896, + "grad_norm": 0.2344196809712545, + "learning_rate": 2.7764857369531078e-05, + "loss": 0.3851, + "step": 1314 + }, + { + "epoch": 4.147915027537372, + "grad_norm": 0.1697952320993672, + "learning_rate": 2.774742754244764e-05, + "loss": 0.3833, + "step": 1315 + }, + { + "epoch": 4.151062155782848, + "grad_norm": 0.20972633292156506, + "learning_rate": 2.7729991712403697e-05, + "loss": 0.3841, + "step": 1316 + }, + { + "epoch": 4.154209284028324, + "grad_norm": 0.20826674826782807, + "learning_rate": 2.7712549897610284e-05, + "loss": 0.3873, + "step": 1317 + }, + { + "epoch": 4.1573564122738, + "grad_norm": 0.18808361463227552, + "learning_rate": 2.769510211628468e-05, + "loss": 0.3831, + "step": 1318 + }, + { + "epoch": 4.160503540519276, + "grad_norm": 0.20800615549370643, + "learning_rate": 2.767764838665041e-05, + "loss": 0.3785, + "step": 1319 + }, + { + "epoch": 4.1636506687647525, + "grad_norm": 0.22375396662721606, + "learning_rate": 2.766018872693719e-05, + "loss": 0.3835, + "step": 1320 + }, + { + "epoch": 4.166797797010228, + "grad_norm": 0.19240780126585705, + "learning_rate": 2.764272315538096e-05, + "loss": 0.3832, + "step": 1321 + }, + { + "epoch": 4.169944925255704, + "grad_norm": 0.2037920501367335, + "learning_rate": 2.762525169022381e-05, + "loss": 0.387, + "step": 1322 + }, + { + "epoch": 4.17309205350118, + "grad_norm": 0.1946112985945064, + "learning_rate": 2.7607774349713997e-05, + "loss": 0.3882, + "step": 1323 + }, + { + "epoch": 4.176239181746656, + "grad_norm": 0.19471855963610726, + "learning_rate": 2.7590291152105905e-05, + "loss": 0.3859, + "step": 1324 + }, + { + "epoch": 4.1793863099921325, + "grad_norm": 0.208895734249875, + "learning_rate": 2.7572802115660045e-05, + "loss": 0.3899, + "step": 1325 + }, + { + "epoch": 4.182533438237608, + "grad_norm": 0.19332682587345157, + "learning_rate": 2.7555307258643028e-05, + "loss": 0.3817, + "step": 1326 + }, + { + "epoch": 4.185680566483084, + "grad_norm": 0.20554112935607244, + "learning_rate": 2.753780659932753e-05, + "loss": 0.3892, + "step": 1327 + }, + { + "epoch": 4.18882769472856, + "grad_norm": 0.1863703929014815, + "learning_rate": 2.7520300155992296e-05, + "loss": 0.3989, + "step": 1328 + }, + { + "epoch": 4.191974822974037, + "grad_norm": 0.2181184280633318, + "learning_rate": 2.7502787946922125e-05, + "loss": 0.3857, + "step": 1329 + }, + { + "epoch": 4.195121951219512, + "grad_norm": 0.1877040187866126, + "learning_rate": 2.748526999040782e-05, + "loss": 0.3846, + "step": 1330 + }, + { + "epoch": 4.198269079464988, + "grad_norm": 0.2101059077905614, + "learning_rate": 2.7467746304746192e-05, + "loss": 0.3791, + "step": 1331 + }, + { + "epoch": 4.201416207710464, + "grad_norm": 0.20021777258381446, + "learning_rate": 2.7450216908240037e-05, + "loss": 0.3829, + "step": 1332 + }, + { + "epoch": 4.20456333595594, + "grad_norm": 0.2135816346994294, + "learning_rate": 2.7432681819198114e-05, + "loss": 0.385, + "step": 1333 + }, + { + "epoch": 4.2077104642014165, + "grad_norm": 0.18046740545279863, + "learning_rate": 2.7415141055935132e-05, + "loss": 0.3744, + "step": 1334 + }, + { + "epoch": 4.210857592446892, + "grad_norm": 0.20184956661048567, + "learning_rate": 2.739759463677172e-05, + "loss": 0.3773, + "step": 1335 + }, + { + "epoch": 4.214004720692368, + "grad_norm": 0.1683940922170243, + "learning_rate": 2.738004258003442e-05, + "loss": 0.3816, + "step": 1336 + }, + { + "epoch": 4.217151848937844, + "grad_norm": 0.178967099609675, + "learning_rate": 2.736248490405567e-05, + "loss": 0.3868, + "step": 1337 + }, + { + "epoch": 4.220298977183321, + "grad_norm": 0.17657160485024884, + "learning_rate": 2.7344921627173745e-05, + "loss": 0.3838, + "step": 1338 + }, + { + "epoch": 4.223446105428796, + "grad_norm": 0.1871577205312587, + "learning_rate": 2.732735276773282e-05, + "loss": 0.3852, + "step": 1339 + }, + { + "epoch": 4.226593233674272, + "grad_norm": 0.18868214707434067, + "learning_rate": 2.7309778344082853e-05, + "loss": 0.3897, + "step": 1340 + }, + { + "epoch": 4.229740361919748, + "grad_norm": 0.17848169409152329, + "learning_rate": 2.7292198374579637e-05, + "loss": 0.3841, + "step": 1341 + }, + { + "epoch": 4.232887490165224, + "grad_norm": 0.19858344201399433, + "learning_rate": 2.727461287758476e-05, + "loss": 0.3877, + "step": 1342 + }, + { + "epoch": 4.2360346184107005, + "grad_norm": 0.19726514845450016, + "learning_rate": 2.7257021871465566e-05, + "loss": 0.3838, + "step": 1343 + }, + { + "epoch": 4.239181746656176, + "grad_norm": 0.177223025272618, + "learning_rate": 2.723942537459518e-05, + "loss": 0.394, + "step": 1344 + }, + { + "epoch": 4.242328874901652, + "grad_norm": 0.2355427079374641, + "learning_rate": 2.7221823405352435e-05, + "loss": 0.3861, + "step": 1345 + }, + { + "epoch": 4.245476003147128, + "grad_norm": 0.20054623016987655, + "learning_rate": 2.72042159821219e-05, + "loss": 0.3888, + "step": 1346 + }, + { + "epoch": 4.248623131392605, + "grad_norm": 0.2041632465765231, + "learning_rate": 2.7186603123293824e-05, + "loss": 0.3795, + "step": 1347 + }, + { + "epoch": 4.25177025963808, + "grad_norm": 0.22641717412431214, + "learning_rate": 2.716898484726414e-05, + "loss": 0.3778, + "step": 1348 + }, + { + "epoch": 4.254917387883556, + "grad_norm": 0.1916050281481582, + "learning_rate": 2.7151361172434447e-05, + "loss": 0.3837, + "step": 1349 + }, + { + "epoch": 4.258064516129032, + "grad_norm": 0.24137246015980246, + "learning_rate": 2.713373211721196e-05, + "loss": 0.3862, + "step": 1350 + }, + { + "epoch": 4.261211644374509, + "grad_norm": 0.18174131822711345, + "learning_rate": 2.711609770000955e-05, + "loss": 0.3816, + "step": 1351 + }, + { + "epoch": 4.2643587726199845, + "grad_norm": 0.2242923413154568, + "learning_rate": 2.7098457939245654e-05, + "loss": 0.3872, + "step": 1352 + }, + { + "epoch": 4.26750590086546, + "grad_norm": 0.2235462568563175, + "learning_rate": 2.7080812853344304e-05, + "loss": 0.3996, + "step": 1353 + }, + { + "epoch": 4.270653029110936, + "grad_norm": 0.24969061397705838, + "learning_rate": 2.7063162460735103e-05, + "loss": 0.3816, + "step": 1354 + }, + { + "epoch": 4.273800157356412, + "grad_norm": 0.22715980857633994, + "learning_rate": 2.7045506779853186e-05, + "loss": 0.3852, + "step": 1355 + }, + { + "epoch": 4.276947285601889, + "grad_norm": 0.1822166185884246, + "learning_rate": 2.7027845829139202e-05, + "loss": 0.3803, + "step": 1356 + }, + { + "epoch": 4.280094413847364, + "grad_norm": 0.20932165509317502, + "learning_rate": 2.7010179627039318e-05, + "loss": 0.387, + "step": 1357 + }, + { + "epoch": 4.28324154209284, + "grad_norm": 0.20308647316174985, + "learning_rate": 2.699250819200519e-05, + "loss": 0.3864, + "step": 1358 + }, + { + "epoch": 4.286388670338316, + "grad_norm": 0.18217363195930258, + "learning_rate": 2.6974831542493923e-05, + "loss": 0.3802, + "step": 1359 + }, + { + "epoch": 4.289535798583792, + "grad_norm": 0.186444098060944, + "learning_rate": 2.6957149696968085e-05, + "loss": 0.3848, + "step": 1360 + }, + { + "epoch": 4.2926829268292686, + "grad_norm": 0.1844457489090321, + "learning_rate": 2.6939462673895663e-05, + "loss": 0.3812, + "step": 1361 + }, + { + "epoch": 4.295830055074744, + "grad_norm": 0.2010822546007924, + "learning_rate": 2.6921770491750044e-05, + "loss": 0.3897, + "step": 1362 + }, + { + "epoch": 4.29897718332022, + "grad_norm": 0.20222906345664107, + "learning_rate": 2.690407316901002e-05, + "loss": 0.3865, + "step": 1363 + }, + { + "epoch": 4.302124311565696, + "grad_norm": 0.18971869892805163, + "learning_rate": 2.6886370724159738e-05, + "loss": 0.3854, + "step": 1364 + }, + { + "epoch": 4.305271439811173, + "grad_norm": 0.1997246946094328, + "learning_rate": 2.686866317568871e-05, + "loss": 0.3868, + "step": 1365 + }, + { + "epoch": 4.3084185680566485, + "grad_norm": 0.18961656604722107, + "learning_rate": 2.685095054209176e-05, + "loss": 0.3904, + "step": 1366 + }, + { + "epoch": 4.311565696302124, + "grad_norm": 0.18958617985805862, + "learning_rate": 2.6833232841869038e-05, + "loss": 0.3832, + "step": 1367 + }, + { + "epoch": 4.3147128245476, + "grad_norm": 0.20597877250679947, + "learning_rate": 2.681551009352598e-05, + "loss": 0.3794, + "step": 1368 + }, + { + "epoch": 4.317859952793077, + "grad_norm": 0.2060368281881852, + "learning_rate": 2.679778231557329e-05, + "loss": 0.3845, + "step": 1369 + }, + { + "epoch": 4.321007081038553, + "grad_norm": 0.23435327637660347, + "learning_rate": 2.6780049526526934e-05, + "loss": 0.392, + "step": 1370 + }, + { + "epoch": 4.324154209284028, + "grad_norm": 0.19475763086784323, + "learning_rate": 2.6762311744908106e-05, + "loss": 0.387, + "step": 1371 + }, + { + "epoch": 4.327301337529504, + "grad_norm": 0.191113901147632, + "learning_rate": 2.674456898924322e-05, + "loss": 0.3873, + "step": 1372 + }, + { + "epoch": 4.33044846577498, + "grad_norm": 0.18932113213075516, + "learning_rate": 2.6726821278063878e-05, + "loss": 0.3815, + "step": 1373 + }, + { + "epoch": 4.333595594020457, + "grad_norm": 0.18886476158101362, + "learning_rate": 2.6709068629906867e-05, + "loss": 0.3826, + "step": 1374 + }, + { + "epoch": 4.3367427222659325, + "grad_norm": 0.19201665124650338, + "learning_rate": 2.669131106331412e-05, + "loss": 0.3926, + "step": 1375 + }, + { + "epoch": 4.339889850511408, + "grad_norm": 0.20585223962817667, + "learning_rate": 2.667354859683272e-05, + "loss": 0.3902, + "step": 1376 + }, + { + "epoch": 4.343036978756884, + "grad_norm": 0.18087305906555123, + "learning_rate": 2.6655781249014843e-05, + "loss": 0.3946, + "step": 1377 + }, + { + "epoch": 4.34618410700236, + "grad_norm": 0.22199132776028002, + "learning_rate": 2.6638009038417792e-05, + "loss": 0.3883, + "step": 1378 + }, + { + "epoch": 4.349331235247837, + "grad_norm": 0.20789032314071595, + "learning_rate": 2.662023198360394e-05, + "loss": 0.3863, + "step": 1379 + }, + { + "epoch": 4.352478363493312, + "grad_norm": 0.19784286066300738, + "learning_rate": 2.6602450103140713e-05, + "loss": 0.3964, + "step": 1380 + }, + { + "epoch": 4.355625491738788, + "grad_norm": 0.24791934452491476, + "learning_rate": 2.6584663415600583e-05, + "loss": 0.3862, + "step": 1381 + }, + { + "epoch": 4.358772619984264, + "grad_norm": 0.19522453242970436, + "learning_rate": 2.656687193956104e-05, + "loss": 0.3907, + "step": 1382 + }, + { + "epoch": 4.361919748229741, + "grad_norm": 0.21752375333467266, + "learning_rate": 2.6549075693604575e-05, + "loss": 0.3864, + "step": 1383 + }, + { + "epoch": 4.3650668764752165, + "grad_norm": 0.20160529341364714, + "learning_rate": 2.6531274696318664e-05, + "loss": 0.3965, + "step": 1384 + }, + { + "epoch": 4.368214004720692, + "grad_norm": 0.18568303741674552, + "learning_rate": 2.6513468966295737e-05, + "loss": 0.3885, + "step": 1385 + }, + { + "epoch": 4.371361132966168, + "grad_norm": 0.2243222479567149, + "learning_rate": 2.649565852213318e-05, + "loss": 0.3868, + "step": 1386 + }, + { + "epoch": 4.374508261211645, + "grad_norm": 0.19214945590700291, + "learning_rate": 2.6477843382433302e-05, + "loss": 0.3911, + "step": 1387 + }, + { + "epoch": 4.377655389457121, + "grad_norm": 0.20258061934369762, + "learning_rate": 2.6460023565803305e-05, + "loss": 0.3823, + "step": 1388 + }, + { + "epoch": 4.380802517702596, + "grad_norm": 0.2124917879318387, + "learning_rate": 2.644219909085528e-05, + "loss": 0.386, + "step": 1389 + }, + { + "epoch": 4.383949645948072, + "grad_norm": 0.1907323815413866, + "learning_rate": 2.642436997620619e-05, + "loss": 0.3912, + "step": 1390 + }, + { + "epoch": 4.387096774193548, + "grad_norm": 0.201912862490839, + "learning_rate": 2.6406536240477835e-05, + "loss": 0.3869, + "step": 1391 + }, + { + "epoch": 4.390243902439025, + "grad_norm": 0.1897846042537431, + "learning_rate": 2.6388697902296848e-05, + "loss": 0.3836, + "step": 1392 + }, + { + "epoch": 4.3933910306845005, + "grad_norm": 0.22784186892176736, + "learning_rate": 2.637085498029467e-05, + "loss": 0.3838, + "step": 1393 + }, + { + "epoch": 4.396538158929976, + "grad_norm": 0.20376206510573175, + "learning_rate": 2.6353007493107517e-05, + "loss": 0.3942, + "step": 1394 + }, + { + "epoch": 4.399685287175452, + "grad_norm": 0.19112546192154864, + "learning_rate": 2.6335155459376395e-05, + "loss": 0.3978, + "step": 1395 + }, + { + "epoch": 4.402832415420928, + "grad_norm": 0.20554479838475062, + "learning_rate": 2.6317298897747033e-05, + "loss": 0.3971, + "step": 1396 + }, + { + "epoch": 4.405979543666405, + "grad_norm": 0.19449262983527554, + "learning_rate": 2.6299437826869923e-05, + "loss": 0.3815, + "step": 1397 + }, + { + "epoch": 4.4091266719118805, + "grad_norm": 0.22156499828526832, + "learning_rate": 2.6281572265400223e-05, + "loss": 0.3866, + "step": 1398 + }, + { + "epoch": 4.412273800157356, + "grad_norm": 0.22531435072563047, + "learning_rate": 2.6263702231997824e-05, + "loss": 0.3807, + "step": 1399 + }, + { + "epoch": 4.415420928402832, + "grad_norm": 0.1879250329188261, + "learning_rate": 2.624582774532725e-05, + "loss": 0.3943, + "step": 1400 + }, + { + "epoch": 4.418568056648309, + "grad_norm": 0.2321072669481776, + "learning_rate": 2.6227948824057712e-05, + "loss": 0.3808, + "step": 1401 + }, + { + "epoch": 4.421715184893785, + "grad_norm": 0.19421971388680198, + "learning_rate": 2.6210065486863018e-05, + "loss": 0.3868, + "step": 1402 + }, + { + "epoch": 4.42486231313926, + "grad_norm": 0.23006793771084283, + "learning_rate": 2.6192177752421627e-05, + "loss": 0.3942, + "step": 1403 + }, + { + "epoch": 4.428009441384736, + "grad_norm": 0.253444666264355, + "learning_rate": 2.617428563941655e-05, + "loss": 0.3833, + "step": 1404 + }, + { + "epoch": 4.431156569630213, + "grad_norm": 0.21355125032513483, + "learning_rate": 2.61563891665354e-05, + "loss": 0.3897, + "step": 1405 + }, + { + "epoch": 4.434303697875689, + "grad_norm": 0.19507808857241118, + "learning_rate": 2.613848835247033e-05, + "loss": 0.3825, + "step": 1406 + }, + { + "epoch": 4.4374508261211645, + "grad_norm": 0.22181377156510734, + "learning_rate": 2.6120583215918038e-05, + "loss": 0.3944, + "step": 1407 + }, + { + "epoch": 4.44059795436664, + "grad_norm": 0.17048806847005354, + "learning_rate": 2.6102673775579724e-05, + "loss": 0.3915, + "step": 1408 + }, + { + "epoch": 4.443745082612116, + "grad_norm": 0.2068835428974255, + "learning_rate": 2.6084760050161097e-05, + "loss": 0.3854, + "step": 1409 + }, + { + "epoch": 4.446892210857593, + "grad_norm": 0.2207189101858155, + "learning_rate": 2.606684205837232e-05, + "loss": 0.3831, + "step": 1410 + }, + { + "epoch": 4.450039339103069, + "grad_norm": 0.18810634531927864, + "learning_rate": 2.6048919818928034e-05, + "loss": 0.3791, + "step": 1411 + }, + { + "epoch": 4.453186467348544, + "grad_norm": 0.20997580291783224, + "learning_rate": 2.6030993350547316e-05, + "loss": 0.3886, + "step": 1412 + }, + { + "epoch": 4.45633359559402, + "grad_norm": 0.18183918340442795, + "learning_rate": 2.6013062671953645e-05, + "loss": 0.3861, + "step": 1413 + }, + { + "epoch": 4.459480723839496, + "grad_norm": 0.1991423678980766, + "learning_rate": 2.59951278018749e-05, + "loss": 0.3867, + "step": 1414 + }, + { + "epoch": 4.462627852084973, + "grad_norm": 0.2161981116208714, + "learning_rate": 2.597718875904335e-05, + "loss": 0.393, + "step": 1415 + }, + { + "epoch": 4.4657749803304485, + "grad_norm": 0.1851824205661574, + "learning_rate": 2.5959245562195615e-05, + "loss": 0.3883, + "step": 1416 + }, + { + "epoch": 4.468922108575924, + "grad_norm": 0.20523032246874873, + "learning_rate": 2.594129823007265e-05, + "loss": 0.3949, + "step": 1417 + }, + { + "epoch": 4.4720692368214, + "grad_norm": 0.22841297022323917, + "learning_rate": 2.592334678141973e-05, + "loss": 0.3896, + "step": 1418 + }, + { + "epoch": 4.475216365066877, + "grad_norm": 0.20597518241097829, + "learning_rate": 2.5905391234986445e-05, + "loss": 0.3967, + "step": 1419 + }, + { + "epoch": 4.478363493312353, + "grad_norm": 0.200755938384984, + "learning_rate": 2.5887431609526637e-05, + "loss": 0.382, + "step": 1420 + }, + { + "epoch": 4.481510621557828, + "grad_norm": 0.23257612111660253, + "learning_rate": 2.586946792379844e-05, + "loss": 0.3903, + "step": 1421 + }, + { + "epoch": 4.484657749803304, + "grad_norm": 0.2071736737546111, + "learning_rate": 2.585150019656419e-05, + "loss": 0.3865, + "step": 1422 + }, + { + "epoch": 4.487804878048781, + "grad_norm": 0.18980283295679137, + "learning_rate": 2.5833528446590494e-05, + "loss": 0.3876, + "step": 1423 + }, + { + "epoch": 4.490952006294257, + "grad_norm": 0.20509626159829664, + "learning_rate": 2.581555269264811e-05, + "loss": 0.3858, + "step": 1424 + }, + { + "epoch": 4.4940991345397325, + "grad_norm": 0.1972045387860757, + "learning_rate": 2.5797572953512014e-05, + "loss": 0.3897, + "step": 1425 + }, + { + "epoch": 4.497246262785208, + "grad_norm": 0.21509560076923515, + "learning_rate": 2.5779589247961326e-05, + "loss": 0.3904, + "step": 1426 + }, + { + "epoch": 4.500393391030684, + "grad_norm": 0.19228646717933973, + "learning_rate": 2.576160159477932e-05, + "loss": 0.3918, + "step": 1427 + }, + { + "epoch": 4.503540519276161, + "grad_norm": 0.1795849927351903, + "learning_rate": 2.5743610012753375e-05, + "loss": 0.3953, + "step": 1428 + }, + { + "epoch": 4.506687647521637, + "grad_norm": 0.20489830860011532, + "learning_rate": 2.5725614520675003e-05, + "loss": 0.3919, + "step": 1429 + }, + { + "epoch": 4.5098347757671124, + "grad_norm": 0.21882006418429392, + "learning_rate": 2.5707615137339774e-05, + "loss": 0.3938, + "step": 1430 + }, + { + "epoch": 4.512981904012588, + "grad_norm": 0.19280187229135973, + "learning_rate": 2.5689611881547333e-05, + "loss": 0.3851, + "step": 1431 + }, + { + "epoch": 4.516129032258064, + "grad_norm": 0.20929709845343097, + "learning_rate": 2.5671604772101364e-05, + "loss": 0.3869, + "step": 1432 + }, + { + "epoch": 4.519276160503541, + "grad_norm": 0.19034278382916123, + "learning_rate": 2.565359382780959e-05, + "loss": 0.3892, + "step": 1433 + }, + { + "epoch": 4.522423288749017, + "grad_norm": 0.1955253897676763, + "learning_rate": 2.5635579067483716e-05, + "loss": 0.3948, + "step": 1434 + }, + { + "epoch": 4.525570416994492, + "grad_norm": 0.21006893895734358, + "learning_rate": 2.5617560509939453e-05, + "loss": 0.3902, + "step": 1435 + }, + { + "epoch": 4.528717545239968, + "grad_norm": 0.18703258921272223, + "learning_rate": 2.5599538173996466e-05, + "loss": 0.3945, + "step": 1436 + }, + { + "epoch": 4.531864673485445, + "grad_norm": 0.17700701860895593, + "learning_rate": 2.5581512078478384e-05, + "loss": 0.3872, + "step": 1437 + }, + { + "epoch": 4.535011801730921, + "grad_norm": 0.1904737922390524, + "learning_rate": 2.5563482242212735e-05, + "loss": 0.3918, + "step": 1438 + }, + { + "epoch": 4.5381589299763965, + "grad_norm": 0.19827598790350312, + "learning_rate": 2.554544868403098e-05, + "loss": 0.3936, + "step": 1439 + }, + { + "epoch": 4.541306058221872, + "grad_norm": 0.16151435086680768, + "learning_rate": 2.5527411422768454e-05, + "loss": 0.3915, + "step": 1440 + }, + { + "epoch": 4.544453186467349, + "grad_norm": 0.18657335720318344, + "learning_rate": 2.5509370477264358e-05, + "loss": 0.3919, + "step": 1441 + }, + { + "epoch": 4.547600314712825, + "grad_norm": 0.17941003140458014, + "learning_rate": 2.5491325866361737e-05, + "loss": 0.3876, + "step": 1442 + }, + { + "epoch": 4.550747442958301, + "grad_norm": 0.17734405961341654, + "learning_rate": 2.547327760890749e-05, + "loss": 0.3982, + "step": 1443 + }, + { + "epoch": 4.553894571203776, + "grad_norm": 0.17524164503141434, + "learning_rate": 2.5455225723752308e-05, + "loss": 0.3858, + "step": 1444 + }, + { + "epoch": 4.557041699449252, + "grad_norm": 0.17166835200788202, + "learning_rate": 2.5437170229750655e-05, + "loss": 0.3926, + "step": 1445 + }, + { + "epoch": 4.560188827694729, + "grad_norm": 0.17236320310568598, + "learning_rate": 2.541911114576079e-05, + "loss": 0.3917, + "step": 1446 + }, + { + "epoch": 4.563335955940205, + "grad_norm": 0.16586917527643763, + "learning_rate": 2.5401048490644713e-05, + "loss": 0.3905, + "step": 1447 + }, + { + "epoch": 4.5664830841856805, + "grad_norm": 0.1749389748686086, + "learning_rate": 2.538298228326814e-05, + "loss": 0.3943, + "step": 1448 + }, + { + "epoch": 4.569630212431156, + "grad_norm": 0.18445523358454083, + "learning_rate": 2.536491254250052e-05, + "loss": 0.3809, + "step": 1449 + }, + { + "epoch": 4.572777340676632, + "grad_norm": 0.18404046799235896, + "learning_rate": 2.534683928721498e-05, + "loss": 0.3937, + "step": 1450 + }, + { + "epoch": 4.575924468922109, + "grad_norm": 0.17788876905444853, + "learning_rate": 2.532876253628831e-05, + "loss": 0.3835, + "step": 1451 + }, + { + "epoch": 4.579071597167585, + "grad_norm": 0.18602157578478964, + "learning_rate": 2.5310682308600976e-05, + "loss": 0.3943, + "step": 1452 + }, + { + "epoch": 4.58221872541306, + "grad_norm": 0.17232298418778785, + "learning_rate": 2.5292598623037057e-05, + "loss": 0.3851, + "step": 1453 + }, + { + "epoch": 4.585365853658536, + "grad_norm": 0.1905608180973461, + "learning_rate": 2.5274511498484236e-05, + "loss": 0.3826, + "step": 1454 + }, + { + "epoch": 4.588512981904013, + "grad_norm": 0.17927303844283918, + "learning_rate": 2.5256420953833813e-05, + "loss": 0.3817, + "step": 1455 + }, + { + "epoch": 4.591660110149489, + "grad_norm": 0.20088651754247414, + "learning_rate": 2.5238327007980635e-05, + "loss": 0.3862, + "step": 1456 + }, + { + "epoch": 4.5948072383949645, + "grad_norm": 0.1800905518727353, + "learning_rate": 2.5220229679823113e-05, + "loss": 0.3935, + "step": 1457 + }, + { + "epoch": 4.59795436664044, + "grad_norm": 0.19383388730097495, + "learning_rate": 2.5202128988263183e-05, + "loss": 0.4014, + "step": 1458 + }, + { + "epoch": 4.601101494885917, + "grad_norm": 0.20016364463516104, + "learning_rate": 2.5184024952206315e-05, + "loss": 0.3904, + "step": 1459 + }, + { + "epoch": 4.604248623131393, + "grad_norm": 0.19534288874997346, + "learning_rate": 2.5165917590561453e-05, + "loss": 0.3884, + "step": 1460 + }, + { + "epoch": 4.607395751376869, + "grad_norm": 0.19845903664537287, + "learning_rate": 2.514780692224102e-05, + "loss": 0.3886, + "step": 1461 + }, + { + "epoch": 4.610542879622344, + "grad_norm": 0.18935447457983023, + "learning_rate": 2.5129692966160887e-05, + "loss": 0.3847, + "step": 1462 + }, + { + "epoch": 4.61369000786782, + "grad_norm": 0.1825215910021681, + "learning_rate": 2.511157574124037e-05, + "loss": 0.396, + "step": 1463 + }, + { + "epoch": 4.616837136113297, + "grad_norm": 0.1912881105818978, + "learning_rate": 2.5093455266402185e-05, + "loss": 0.3891, + "step": 1464 + }, + { + "epoch": 4.619984264358773, + "grad_norm": 0.18458695403611322, + "learning_rate": 2.507533156057246e-05, + "loss": 0.3951, + "step": 1465 + }, + { + "epoch": 4.6231313926042485, + "grad_norm": 0.17916510574048766, + "learning_rate": 2.5057204642680684e-05, + "loss": 0.3915, + "step": 1466 + }, + { + "epoch": 4.626278520849724, + "grad_norm": 0.17110403262888976, + "learning_rate": 2.50390745316597e-05, + "loss": 0.3845, + "step": 1467 + }, + { + "epoch": 4.6294256490952, + "grad_norm": 0.17903953256524813, + "learning_rate": 2.50209412464457e-05, + "loss": 0.383, + "step": 1468 + }, + { + "epoch": 4.632572777340677, + "grad_norm": 0.17999296179047053, + "learning_rate": 2.5002804805978177e-05, + "loss": 0.3944, + "step": 1469 + }, + { + "epoch": 4.635719905586153, + "grad_norm": 0.17581481498146168, + "learning_rate": 2.498466522919993e-05, + "loss": 0.3892, + "step": 1470 + }, + { + "epoch": 4.6388670338316285, + "grad_norm": 0.1783825034649337, + "learning_rate": 2.4966522535057024e-05, + "loss": 0.3891, + "step": 1471 + }, + { + "epoch": 4.642014162077104, + "grad_norm": 0.18377927440718408, + "learning_rate": 2.494837674249878e-05, + "loss": 0.3903, + "step": 1472 + }, + { + "epoch": 4.645161290322581, + "grad_norm": 0.18349514371989203, + "learning_rate": 2.4930227870477773e-05, + "loss": 0.3902, + "step": 1473 + }, + { + "epoch": 4.648308418568057, + "grad_norm": 0.1763906966839621, + "learning_rate": 2.491207593794977e-05, + "loss": 0.3857, + "step": 1474 + }, + { + "epoch": 4.651455546813533, + "grad_norm": 0.17906752178646956, + "learning_rate": 2.4893920963873746e-05, + "loss": 0.3908, + "step": 1475 + }, + { + "epoch": 4.654602675059008, + "grad_norm": 0.1679211823950241, + "learning_rate": 2.487576296721186e-05, + "loss": 0.3955, + "step": 1476 + }, + { + "epoch": 4.657749803304485, + "grad_norm": 0.1755347291395844, + "learning_rate": 2.485760196692942e-05, + "loss": 0.3916, + "step": 1477 + }, + { + "epoch": 4.660896931549961, + "grad_norm": 0.16465166030319364, + "learning_rate": 2.4839437981994867e-05, + "loss": 0.3903, + "step": 1478 + }, + { + "epoch": 4.664044059795437, + "grad_norm": 0.17776543423913058, + "learning_rate": 2.4821271031379765e-05, + "loss": 0.394, + "step": 1479 + }, + { + "epoch": 4.6671911880409125, + "grad_norm": 0.1781942266460176, + "learning_rate": 2.4803101134058775e-05, + "loss": 0.395, + "step": 1480 + }, + { + "epoch": 4.670338316286388, + "grad_norm": 0.17946693783829906, + "learning_rate": 2.478492830900964e-05, + "loss": 0.394, + "step": 1481 + }, + { + "epoch": 4.673485444531865, + "grad_norm": 0.1919149254694885, + "learning_rate": 2.4766752575213146e-05, + "loss": 0.3904, + "step": 1482 + }, + { + "epoch": 4.676632572777341, + "grad_norm": 0.16955030838772125, + "learning_rate": 2.4748573951653132e-05, + "loss": 0.388, + "step": 1483 + }, + { + "epoch": 4.679779701022817, + "grad_norm": 0.2002044781358719, + "learning_rate": 2.473039245731646e-05, + "loss": 0.3934, + "step": 1484 + }, + { + "epoch": 4.682926829268292, + "grad_norm": 0.17693479469518242, + "learning_rate": 2.4712208111192965e-05, + "loss": 0.3908, + "step": 1485 + }, + { + "epoch": 4.686073957513768, + "grad_norm": 0.19217906289741862, + "learning_rate": 2.4694020932275483e-05, + "loss": 0.3816, + "step": 1486 + }, + { + "epoch": 4.689221085759245, + "grad_norm": 0.1818600906270269, + "learning_rate": 2.467583093955981e-05, + "loss": 0.3894, + "step": 1487 + }, + { + "epoch": 4.692368214004721, + "grad_norm": 0.2030839509068234, + "learning_rate": 2.4657638152044667e-05, + "loss": 0.3868, + "step": 1488 + }, + { + "epoch": 4.6955153422501965, + "grad_norm": 0.18302211645178032, + "learning_rate": 2.4639442588731695e-05, + "loss": 0.3894, + "step": 1489 + }, + { + "epoch": 4.698662470495672, + "grad_norm": 0.17993702783679505, + "learning_rate": 2.4621244268625448e-05, + "loss": 0.393, + "step": 1490 + }, + { + "epoch": 4.701809598741149, + "grad_norm": 0.1836591030041654, + "learning_rate": 2.4603043210733343e-05, + "loss": 0.3936, + "step": 1491 + }, + { + "epoch": 4.704956726986625, + "grad_norm": 0.17969588856182217, + "learning_rate": 2.4584839434065675e-05, + "loss": 0.3896, + "step": 1492 + }, + { + "epoch": 4.708103855232101, + "grad_norm": 0.18627363426302215, + "learning_rate": 2.4566632957635555e-05, + "loss": 0.3963, + "step": 1493 + }, + { + "epoch": 4.711250983477576, + "grad_norm": 0.16348207104757354, + "learning_rate": 2.454842380045894e-05, + "loss": 0.38, + "step": 1494 + }, + { + "epoch": 4.714398111723053, + "grad_norm": 0.1932898447384366, + "learning_rate": 2.453021198155456e-05, + "loss": 0.3915, + "step": 1495 + }, + { + "epoch": 4.717545239968529, + "grad_norm": 0.17318365673903224, + "learning_rate": 2.451199751994395e-05, + "loss": 0.3942, + "step": 1496 + }, + { + "epoch": 4.720692368214005, + "grad_norm": 0.18679178875572805, + "learning_rate": 2.449378043465139e-05, + "loss": 0.3916, + "step": 1497 + }, + { + "epoch": 4.7238394964594805, + "grad_norm": 0.18301203217504775, + "learning_rate": 2.44755607447039e-05, + "loss": 0.3958, + "step": 1498 + }, + { + "epoch": 4.726986624704956, + "grad_norm": 0.18206504927748668, + "learning_rate": 2.4457338469131235e-05, + "loss": 0.3935, + "step": 1499 + }, + { + "epoch": 4.730133752950433, + "grad_norm": 0.19626381276499383, + "learning_rate": 2.4439113626965832e-05, + "loss": 0.3921, + "step": 1500 + }, + { + "epoch": 4.733280881195909, + "grad_norm": 0.209994269745939, + "learning_rate": 2.4420886237242812e-05, + "loss": 0.3896, + "step": 1501 + }, + { + "epoch": 4.736428009441385, + "grad_norm": 0.20966234827738384, + "learning_rate": 2.440265631899998e-05, + "loss": 0.3872, + "step": 1502 + }, + { + "epoch": 4.7395751376868605, + "grad_norm": 0.17956697728831314, + "learning_rate": 2.438442389127775e-05, + "loss": 0.3905, + "step": 1503 + }, + { + "epoch": 4.742722265932336, + "grad_norm": 0.230149257151023, + "learning_rate": 2.4366188973119173e-05, + "loss": 0.3942, + "step": 1504 + }, + { + "epoch": 4.745869394177813, + "grad_norm": 0.16446551541896534, + "learning_rate": 2.43479515835699e-05, + "loss": 0.3971, + "step": 1505 + }, + { + "epoch": 4.749016522423289, + "grad_norm": 0.21606059095007257, + "learning_rate": 2.4329711741678158e-05, + "loss": 0.3971, + "step": 1506 + }, + { + "epoch": 4.752163650668765, + "grad_norm": 0.18761423316527837, + "learning_rate": 2.4311469466494747e-05, + "loss": 0.3822, + "step": 1507 + }, + { + "epoch": 4.755310778914241, + "grad_norm": 0.20024056830304296, + "learning_rate": 2.429322477707299e-05, + "loss": 0.394, + "step": 1508 + }, + { + "epoch": 4.758457907159717, + "grad_norm": 0.20562507776072017, + "learning_rate": 2.4274977692468765e-05, + "loss": 0.3895, + "step": 1509 + }, + { + "epoch": 4.761605035405193, + "grad_norm": 0.1760613692982041, + "learning_rate": 2.4256728231740406e-05, + "loss": 0.3999, + "step": 1510 + }, + { + "epoch": 4.764752163650669, + "grad_norm": 0.22523629907709936, + "learning_rate": 2.423847641394877e-05, + "loss": 0.3881, + "step": 1511 + }, + { + "epoch": 4.7678992918961445, + "grad_norm": 0.18831716496370055, + "learning_rate": 2.422022225815714e-05, + "loss": 0.394, + "step": 1512 + }, + { + "epoch": 4.771046420141621, + "grad_norm": 0.22513412157385423, + "learning_rate": 2.4201965783431267e-05, + "loss": 0.3875, + "step": 1513 + }, + { + "epoch": 4.774193548387097, + "grad_norm": 0.21031494310694007, + "learning_rate": 2.4183707008839323e-05, + "loss": 0.3775, + "step": 1514 + }, + { + "epoch": 4.777340676632573, + "grad_norm": 0.21590978367024655, + "learning_rate": 2.4165445953451867e-05, + "loss": 0.3899, + "step": 1515 + }, + { + "epoch": 4.780487804878049, + "grad_norm": 0.22564896784384017, + "learning_rate": 2.414718263634185e-05, + "loss": 0.3913, + "step": 1516 + }, + { + "epoch": 4.783634933123524, + "grad_norm": 0.19020615700820612, + "learning_rate": 2.4128917076584587e-05, + "loss": 0.3944, + "step": 1517 + }, + { + "epoch": 4.786782061369001, + "grad_norm": 0.22953746850079004, + "learning_rate": 2.4110649293257728e-05, + "loss": 0.3986, + "step": 1518 + }, + { + "epoch": 4.789929189614477, + "grad_norm": 0.1805102124166518, + "learning_rate": 2.4092379305441252e-05, + "loss": 0.3898, + "step": 1519 + }, + { + "epoch": 4.793076317859953, + "grad_norm": 0.2268972739180654, + "learning_rate": 2.407410713221743e-05, + "loss": 0.3938, + "step": 1520 + }, + { + "epoch": 4.7962234461054285, + "grad_norm": 0.19050233625931567, + "learning_rate": 2.4055832792670842e-05, + "loss": 0.3924, + "step": 1521 + }, + { + "epoch": 4.799370574350904, + "grad_norm": 0.1922960780024949, + "learning_rate": 2.4037556305888288e-05, + "loss": 0.3813, + "step": 1522 + }, + { + "epoch": 4.802517702596381, + "grad_norm": 0.1898465812093314, + "learning_rate": 2.4019277690958856e-05, + "loss": 0.3939, + "step": 1523 + }, + { + "epoch": 4.805664830841857, + "grad_norm": 0.18381007690254716, + "learning_rate": 2.4000996966973817e-05, + "loss": 0.394, + "step": 1524 + }, + { + "epoch": 4.808811959087333, + "grad_norm": 0.2103100782295867, + "learning_rate": 2.398271415302668e-05, + "loss": 0.3897, + "step": 1525 + }, + { + "epoch": 4.811959087332809, + "grad_norm": 0.16769183783145522, + "learning_rate": 2.3964429268213115e-05, + "loss": 0.3972, + "step": 1526 + }, + { + "epoch": 4.815106215578285, + "grad_norm": 0.19945262652056686, + "learning_rate": 2.3946142331630955e-05, + "loss": 0.3941, + "step": 1527 + }, + { + "epoch": 4.818253343823761, + "grad_norm": 0.1700848589131017, + "learning_rate": 2.392785336238019e-05, + "loss": 0.3902, + "step": 1528 + }, + { + "epoch": 4.821400472069237, + "grad_norm": 0.1656556368956296, + "learning_rate": 2.390956237956291e-05, + "loss": 0.3933, + "step": 1529 + }, + { + "epoch": 4.8245476003147125, + "grad_norm": 0.18373049425460136, + "learning_rate": 2.389126940228333e-05, + "loss": 0.3956, + "step": 1530 + }, + { + "epoch": 4.827694728560189, + "grad_norm": 0.17964107945591998, + "learning_rate": 2.387297444964775e-05, + "loss": 0.3871, + "step": 1531 + }, + { + "epoch": 4.830841856805665, + "grad_norm": 0.17450741916661142, + "learning_rate": 2.385467754076451e-05, + "loss": 0.3788, + "step": 1532 + }, + { + "epoch": 4.833988985051141, + "grad_norm": 0.18365024872291147, + "learning_rate": 2.3836378694744014e-05, + "loss": 0.3986, + "step": 1533 + }, + { + "epoch": 4.837136113296617, + "grad_norm": 0.18009519290000942, + "learning_rate": 2.3818077930698683e-05, + "loss": 0.4009, + "step": 1534 + }, + { + "epoch": 4.840283241542092, + "grad_norm": 0.1962147329229101, + "learning_rate": 2.3799775267742934e-05, + "loss": 0.3919, + "step": 1535 + }, + { + "epoch": 4.843430369787569, + "grad_norm": 0.1814959883127099, + "learning_rate": 2.3781470724993186e-05, + "loss": 0.3894, + "step": 1536 + }, + { + "epoch": 4.846577498033045, + "grad_norm": 0.19522141468087864, + "learning_rate": 2.376316432156779e-05, + "loss": 0.3915, + "step": 1537 + }, + { + "epoch": 4.849724626278521, + "grad_norm": 0.1824952868828727, + "learning_rate": 2.3744856076587076e-05, + "loss": 0.396, + "step": 1538 + }, + { + "epoch": 4.8528717545239966, + "grad_norm": 0.19970076246737523, + "learning_rate": 2.3726546009173275e-05, + "loss": 0.3975, + "step": 1539 + }, + { + "epoch": 4.856018882769473, + "grad_norm": 0.1788577313593828, + "learning_rate": 2.3708234138450518e-05, + "loss": 0.3888, + "step": 1540 + }, + { + "epoch": 4.859166011014949, + "grad_norm": 0.2006715626887735, + "learning_rate": 2.368992048354485e-05, + "loss": 0.3904, + "step": 1541 + }, + { + "epoch": 4.862313139260425, + "grad_norm": 0.1655853950892595, + "learning_rate": 2.3671605063584147e-05, + "loss": 0.3917, + "step": 1542 + }, + { + "epoch": 4.865460267505901, + "grad_norm": 0.20564415582976606, + "learning_rate": 2.3653287897698135e-05, + "loss": 0.3935, + "step": 1543 + }, + { + "epoch": 4.868607395751377, + "grad_norm": 0.16818135554428862, + "learning_rate": 2.3634969005018377e-05, + "loss": 0.39, + "step": 1544 + }, + { + "epoch": 4.871754523996853, + "grad_norm": 0.17762234599652035, + "learning_rate": 2.361664840467823e-05, + "loss": 0.3926, + "step": 1545 + }, + { + "epoch": 4.874901652242329, + "grad_norm": 0.1932106423562817, + "learning_rate": 2.359832611581283e-05, + "loss": 0.385, + "step": 1546 + }, + { + "epoch": 4.878048780487805, + "grad_norm": 0.186869483874634, + "learning_rate": 2.358000215755909e-05, + "loss": 0.388, + "step": 1547 + }, + { + "epoch": 4.881195908733281, + "grad_norm": 0.18001846076878525, + "learning_rate": 2.3561676549055646e-05, + "loss": 0.3915, + "step": 1548 + }, + { + "epoch": 4.884343036978757, + "grad_norm": 0.1920048629271822, + "learning_rate": 2.3543349309442887e-05, + "loss": 0.392, + "step": 1549 + }, + { + "epoch": 4.887490165224233, + "grad_norm": 0.17088394010939248, + "learning_rate": 2.3525020457862878e-05, + "loss": 0.3964, + "step": 1550 + }, + { + "epoch": 4.890637293469709, + "grad_norm": 0.19786920135937375, + "learning_rate": 2.3506690013459376e-05, + "loss": 0.3843, + "step": 1551 + }, + { + "epoch": 4.893784421715185, + "grad_norm": 0.18033263963836252, + "learning_rate": 2.348835799537782e-05, + "loss": 0.3951, + "step": 1552 + }, + { + "epoch": 4.8969315499606605, + "grad_norm": 0.20462979636333165, + "learning_rate": 2.3470024422765267e-05, + "loss": 0.3913, + "step": 1553 + }, + { + "epoch": 4.900078678206137, + "grad_norm": 0.1679693396240305, + "learning_rate": 2.3451689314770404e-05, + "loss": 0.3933, + "step": 1554 + }, + { + "epoch": 4.903225806451613, + "grad_norm": 0.18705636136779824, + "learning_rate": 2.3433352690543533e-05, + "loss": 0.3875, + "step": 1555 + }, + { + "epoch": 4.906372934697089, + "grad_norm": 0.1725007429553329, + "learning_rate": 2.3415014569236522e-05, + "loss": 0.3922, + "step": 1556 + }, + { + "epoch": 4.909520062942565, + "grad_norm": 0.19115101454023312, + "learning_rate": 2.3396674970002824e-05, + "loss": 0.3865, + "step": 1557 + }, + { + "epoch": 4.912667191188041, + "grad_norm": 0.17053442035080676, + "learning_rate": 2.337833391199742e-05, + "loss": 0.3992, + "step": 1558 + }, + { + "epoch": 4.915814319433517, + "grad_norm": 0.1821548023265103, + "learning_rate": 2.3359991414376814e-05, + "loss": 0.388, + "step": 1559 + }, + { + "epoch": 4.918961447678993, + "grad_norm": 0.1793151595611094, + "learning_rate": 2.3341647496299025e-05, + "loss": 0.3893, + "step": 1560 + }, + { + "epoch": 4.922108575924469, + "grad_norm": 0.180962371465097, + "learning_rate": 2.3323302176923552e-05, + "loss": 0.3948, + "step": 1561 + }, + { + "epoch": 4.925255704169945, + "grad_norm": 0.20297348545826346, + "learning_rate": 2.3304955475411348e-05, + "loss": 0.3846, + "step": 1562 + }, + { + "epoch": 4.928402832415421, + "grad_norm": 0.17305830862076746, + "learning_rate": 2.3286607410924815e-05, + "loss": 0.3879, + "step": 1563 + }, + { + "epoch": 4.931549960660897, + "grad_norm": 0.1913355603249613, + "learning_rate": 2.3268258002627778e-05, + "loss": 0.394, + "step": 1564 + }, + { + "epoch": 4.934697088906373, + "grad_norm": 0.19290416318807532, + "learning_rate": 2.3249907269685473e-05, + "loss": 0.3894, + "step": 1565 + }, + { + "epoch": 4.937844217151849, + "grad_norm": 0.18295228802636587, + "learning_rate": 2.3231555231264525e-05, + "loss": 0.3948, + "step": 1566 + }, + { + "epoch": 4.940991345397325, + "grad_norm": 0.17938423256259314, + "learning_rate": 2.3213201906532895e-05, + "loss": 0.3899, + "step": 1567 + }, + { + "epoch": 4.944138473642801, + "grad_norm": 0.17823287523578704, + "learning_rate": 2.3194847314659908e-05, + "loss": 0.3903, + "step": 1568 + }, + { + "epoch": 4.947285601888277, + "grad_norm": 0.19013188360587602, + "learning_rate": 2.3176491474816207e-05, + "loss": 0.3892, + "step": 1569 + }, + { + "epoch": 4.950432730133753, + "grad_norm": 0.16191899543672794, + "learning_rate": 2.3158134406173742e-05, + "loss": 0.3901, + "step": 1570 + }, + { + "epoch": 4.9535798583792285, + "grad_norm": 0.18934530299370167, + "learning_rate": 2.3139776127905745e-05, + "loss": 0.392, + "step": 1571 + }, + { + "epoch": 4.956726986624705, + "grad_norm": 0.18625354508534378, + "learning_rate": 2.312141665918671e-05, + "loss": 0.393, + "step": 1572 + }, + { + "epoch": 4.959874114870181, + "grad_norm": 0.1716080119401114, + "learning_rate": 2.3103056019192373e-05, + "loss": 0.3934, + "step": 1573 + }, + { + "epoch": 4.963021243115657, + "grad_norm": 0.17658405885657685, + "learning_rate": 2.3084694227099704e-05, + "loss": 0.3929, + "step": 1574 + }, + { + "epoch": 4.966168371361133, + "grad_norm": 0.16588933283792434, + "learning_rate": 2.3066331302086858e-05, + "loss": 0.3994, + "step": 1575 + }, + { + "epoch": 4.969315499606609, + "grad_norm": 0.17724140434357114, + "learning_rate": 2.3047967263333192e-05, + "loss": 0.3866, + "step": 1576 + }, + { + "epoch": 4.972462627852085, + "grad_norm": 0.16829639574698707, + "learning_rate": 2.3029602130019208e-05, + "loss": 0.3939, + "step": 1577 + }, + { + "epoch": 4.975609756097561, + "grad_norm": 0.1696857849206108, + "learning_rate": 2.301123592132657e-05, + "loss": 0.3942, + "step": 1578 + }, + { + "epoch": 4.978756884343037, + "grad_norm": 0.1799379463657916, + "learning_rate": 2.2992868656438046e-05, + "loss": 0.3877, + "step": 1579 + }, + { + "epoch": 4.9819040125885135, + "grad_norm": 0.17173263518727672, + "learning_rate": 2.297450035453752e-05, + "loss": 0.3906, + "step": 1580 + }, + { + "epoch": 4.985051140833989, + "grad_norm": 0.16263371788270237, + "learning_rate": 2.2956131034809957e-05, + "loss": 0.3943, + "step": 1581 + }, + { + "epoch": 4.988198269079465, + "grad_norm": 0.18145271609433958, + "learning_rate": 2.293776071644139e-05, + "loss": 0.3993, + "step": 1582 + }, + { + "epoch": 4.991345397324941, + "grad_norm": 0.17931042976589995, + "learning_rate": 2.291938941861888e-05, + "loss": 0.3871, + "step": 1583 + }, + { + "epoch": 4.994492525570417, + "grad_norm": 0.16386736102567098, + "learning_rate": 2.290101716053053e-05, + "loss": 0.3738, + "step": 1584 + }, + { + "epoch": 4.997639653815893, + "grad_norm": 0.1709713086837328, + "learning_rate": 2.288264396136543e-05, + "loss": 0.3928, + "step": 1585 + }, + { + "epoch": 5.003147128245476, + "grad_norm": 0.46162362322835515, + "learning_rate": 2.2864269840313654e-05, + "loss": 0.723, + "step": 1586 + }, + { + "epoch": 5.006294256490952, + "grad_norm": 0.34659232101376264, + "learning_rate": 2.284589481656625e-05, + "loss": 0.3346, + "step": 1587 + }, + { + "epoch": 5.009441384736428, + "grad_norm": 0.34172986216672346, + "learning_rate": 2.2827518909315206e-05, + "loss": 0.3367, + "step": 1588 + }, + { + "epoch": 5.012588512981904, + "grad_norm": 0.4217601397562919, + "learning_rate": 2.2809142137753422e-05, + "loss": 0.3196, + "step": 1589 + }, + { + "epoch": 5.01573564122738, + "grad_norm": 0.27498607831845434, + "learning_rate": 2.2790764521074717e-05, + "loss": 0.3274, + "step": 1590 + }, + { + "epoch": 5.018882769472856, + "grad_norm": 0.337732177614199, + "learning_rate": 2.2772386078473775e-05, + "loss": 0.3283, + "step": 1591 + }, + { + "epoch": 5.022029897718332, + "grad_norm": 0.31183657652743435, + "learning_rate": 2.2754006829146155e-05, + "loss": 0.3296, + "step": 1592 + }, + { + "epoch": 5.025177025963808, + "grad_norm": 0.3478314549506538, + "learning_rate": 2.2735626792288263e-05, + "loss": 0.3268, + "step": 1593 + }, + { + "epoch": 5.028324154209284, + "grad_norm": 0.321081647039957, + "learning_rate": 2.27172459870973e-05, + "loss": 0.3216, + "step": 1594 + }, + { + "epoch": 5.03147128245476, + "grad_norm": 0.27287489022056094, + "learning_rate": 2.2698864432771313e-05, + "loss": 0.3324, + "step": 1595 + }, + { + "epoch": 5.034618410700236, + "grad_norm": 0.3377680793066894, + "learning_rate": 2.2680482148509092e-05, + "loss": 0.33, + "step": 1596 + }, + { + "epoch": 5.037765538945712, + "grad_norm": 0.3099962793181279, + "learning_rate": 2.266209915351021e-05, + "loss": 0.3208, + "step": 1597 + }, + { + "epoch": 5.040912667191188, + "grad_norm": 0.25169801292292504, + "learning_rate": 2.2643715466974975e-05, + "loss": 0.3261, + "step": 1598 + }, + { + "epoch": 5.044059795436664, + "grad_norm": 0.30251965731477554, + "learning_rate": 2.2625331108104426e-05, + "loss": 0.3217, + "step": 1599 + }, + { + "epoch": 5.04720692368214, + "grad_norm": 0.23662305302926548, + "learning_rate": 2.2606946096100294e-05, + "loss": 0.3315, + "step": 1600 + }, + { + "epoch": 5.050354051927616, + "grad_norm": 0.2651596494454096, + "learning_rate": 2.258856045016499e-05, + "loss": 0.3345, + "step": 1601 + }, + { + "epoch": 5.053501180173092, + "grad_norm": 0.2637402059683434, + "learning_rate": 2.2570174189501608e-05, + "loss": 0.3269, + "step": 1602 + }, + { + "epoch": 5.056648308418568, + "grad_norm": 0.23602081515972934, + "learning_rate": 2.255178733331385e-05, + "loss": 0.3229, + "step": 1603 + }, + { + "epoch": 5.059795436664044, + "grad_norm": 0.25433290519235396, + "learning_rate": 2.253339990080608e-05, + "loss": 0.3191, + "step": 1604 + }, + { + "epoch": 5.06294256490952, + "grad_norm": 0.23107096923107467, + "learning_rate": 2.251501191118323e-05, + "loss": 0.3356, + "step": 1605 + }, + { + "epoch": 5.066089693154996, + "grad_norm": 0.25637351601908676, + "learning_rate": 2.2496623383650828e-05, + "loss": 0.3265, + "step": 1606 + }, + { + "epoch": 5.069236821400472, + "grad_norm": 0.21472300935571184, + "learning_rate": 2.2478234337414962e-05, + "loss": 0.33, + "step": 1607 + }, + { + "epoch": 5.072383949645948, + "grad_norm": 0.2338555364338194, + "learning_rate": 2.245984479168227e-05, + "loss": 0.3298, + "step": 1608 + }, + { + "epoch": 5.075531077891424, + "grad_norm": 0.20022516446625999, + "learning_rate": 2.2441454765659897e-05, + "loss": 0.3342, + "step": 1609 + }, + { + "epoch": 5.0786782061369005, + "grad_norm": 0.22488563846995296, + "learning_rate": 2.2423064278555503e-05, + "loss": 0.326, + "step": 1610 + }, + { + "epoch": 5.081825334382376, + "grad_norm": 0.21302916365806326, + "learning_rate": 2.2404673349577218e-05, + "loss": 0.3282, + "step": 1611 + }, + { + "epoch": 5.084972462627852, + "grad_norm": 0.21796018211989795, + "learning_rate": 2.2386281997933646e-05, + "loss": 0.3258, + "step": 1612 + }, + { + "epoch": 5.088119590873328, + "grad_norm": 0.22425397553381501, + "learning_rate": 2.2367890242833815e-05, + "loss": 0.3297, + "step": 1613 + }, + { + "epoch": 5.091266719118804, + "grad_norm": 0.19706564657591386, + "learning_rate": 2.2349498103487197e-05, + "loss": 0.3273, + "step": 1614 + }, + { + "epoch": 5.09441384736428, + "grad_norm": 0.2071088006731519, + "learning_rate": 2.233110559910365e-05, + "loss": 0.3211, + "step": 1615 + }, + { + "epoch": 5.097560975609756, + "grad_norm": 0.21348102050857448, + "learning_rate": 2.2312712748893403e-05, + "loss": 0.3232, + "step": 1616 + }, + { + "epoch": 5.100708103855232, + "grad_norm": 0.18908349503115035, + "learning_rate": 2.2294319572067082e-05, + "loss": 0.3229, + "step": 1617 + }, + { + "epoch": 5.103855232100708, + "grad_norm": 0.21575807977186254, + "learning_rate": 2.2275926087835625e-05, + "loss": 0.3229, + "step": 1618 + }, + { + "epoch": 5.1070023603461845, + "grad_norm": 0.20410626875283436, + "learning_rate": 2.2257532315410288e-05, + "loss": 0.3261, + "step": 1619 + }, + { + "epoch": 5.11014948859166, + "grad_norm": 0.23313876176564874, + "learning_rate": 2.2239138274002642e-05, + "loss": 0.3298, + "step": 1620 + }, + { + "epoch": 5.113296616837136, + "grad_norm": 0.19848085470082366, + "learning_rate": 2.2220743982824536e-05, + "loss": 0.3244, + "step": 1621 + }, + { + "epoch": 5.116443745082612, + "grad_norm": 0.23764358322337617, + "learning_rate": 2.2202349461088084e-05, + "loss": 0.3316, + "step": 1622 + }, + { + "epoch": 5.119590873328088, + "grad_norm": 0.20745220698687916, + "learning_rate": 2.2183954728005625e-05, + "loss": 0.3225, + "step": 1623 + }, + { + "epoch": 5.122738001573564, + "grad_norm": 0.223041599846075, + "learning_rate": 2.216555980278974e-05, + "loss": 0.3261, + "step": 1624 + }, + { + "epoch": 5.12588512981904, + "grad_norm": 0.19422755456096907, + "learning_rate": 2.2147164704653202e-05, + "loss": 0.3271, + "step": 1625 + }, + { + "epoch": 5.129032258064516, + "grad_norm": 0.20440115995525865, + "learning_rate": 2.2128769452808956e-05, + "loss": 0.3272, + "step": 1626 + }, + { + "epoch": 5.132179386309992, + "grad_norm": 0.2187283808498755, + "learning_rate": 2.211037406647011e-05, + "loss": 0.3265, + "step": 1627 + }, + { + "epoch": 5.1353265145554685, + "grad_norm": 0.2050591037215658, + "learning_rate": 2.2091978564849926e-05, + "loss": 0.3229, + "step": 1628 + }, + { + "epoch": 5.138473642800944, + "grad_norm": 0.22811383006695085, + "learning_rate": 2.2073582967161768e-05, + "loss": 0.336, + "step": 1629 + }, + { + "epoch": 5.14162077104642, + "grad_norm": 0.21037766403293978, + "learning_rate": 2.2055187292619112e-05, + "loss": 0.3234, + "step": 1630 + }, + { + "epoch": 5.144767899291896, + "grad_norm": 0.22544584006363033, + "learning_rate": 2.2036791560435522e-05, + "loss": 0.3232, + "step": 1631 + }, + { + "epoch": 5.147915027537372, + "grad_norm": 0.23307986768402664, + "learning_rate": 2.20183957898246e-05, + "loss": 0.3299, + "step": 1632 + }, + { + "epoch": 5.151062155782848, + "grad_norm": 0.20220722711990272, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.3215, + "step": 1633 + }, + { + "epoch": 5.154209284028324, + "grad_norm": 0.21396702627741238, + "learning_rate": 2.1981604210175407e-05, + "loss": 0.3261, + "step": 1634 + }, + { + "epoch": 5.1573564122738, + "grad_norm": 0.23101866923319364, + "learning_rate": 2.196320843956449e-05, + "loss": 0.3234, + "step": 1635 + }, + { + "epoch": 5.160503540519276, + "grad_norm": 0.22249676919651665, + "learning_rate": 2.1944812707380897e-05, + "loss": 0.3278, + "step": 1636 + }, + { + "epoch": 5.1636506687647525, + "grad_norm": 0.21159027665052352, + "learning_rate": 2.1926417032838238e-05, + "loss": 0.3261, + "step": 1637 + }, + { + "epoch": 5.166797797010228, + "grad_norm": 0.20441901079236766, + "learning_rate": 2.1908021435150083e-05, + "loss": 0.3249, + "step": 1638 + }, + { + "epoch": 5.169944925255704, + "grad_norm": 0.22690097885692212, + "learning_rate": 2.18896259335299e-05, + "loss": 0.3263, + "step": 1639 + }, + { + "epoch": 5.17309205350118, + "grad_norm": 0.19474865782338907, + "learning_rate": 2.1871230547191057e-05, + "loss": 0.3241, + "step": 1640 + }, + { + "epoch": 5.176239181746656, + "grad_norm": 0.24748820815778508, + "learning_rate": 2.18528352953468e-05, + "loss": 0.3293, + "step": 1641 + }, + { + "epoch": 5.1793863099921325, + "grad_norm": 0.21000623423513556, + "learning_rate": 2.1834440197210254e-05, + "loss": 0.3396, + "step": 1642 + }, + { + "epoch": 5.182533438237608, + "grad_norm": 0.2297339762152351, + "learning_rate": 2.1816045271994377e-05, + "loss": 0.3355, + "step": 1643 + }, + { + "epoch": 5.185680566483084, + "grad_norm": 0.23065919694389042, + "learning_rate": 2.1797650538911922e-05, + "loss": 0.3266, + "step": 1644 + }, + { + "epoch": 5.18882769472856, + "grad_norm": 0.21981603962817217, + "learning_rate": 2.1779256017175473e-05, + "loss": 0.3216, + "step": 1645 + }, + { + "epoch": 5.191974822974037, + "grad_norm": 0.2608827800438322, + "learning_rate": 2.1760861725997367e-05, + "loss": 0.3191, + "step": 1646 + }, + { + "epoch": 5.195121951219512, + "grad_norm": 0.19452350370213584, + "learning_rate": 2.1742467684589725e-05, + "loss": 0.3259, + "step": 1647 + }, + { + "epoch": 5.198269079464988, + "grad_norm": 0.22996447660538494, + "learning_rate": 2.1724073912164387e-05, + "loss": 0.3284, + "step": 1648 + }, + { + "epoch": 5.201416207710464, + "grad_norm": 0.22489712820890972, + "learning_rate": 2.170568042793292e-05, + "loss": 0.3229, + "step": 1649 + }, + { + "epoch": 5.20456333595594, + "grad_norm": 0.2002513690412124, + "learning_rate": 2.16872872511066e-05, + "loss": 0.3335, + "step": 1650 + }, + { + "epoch": 5.2077104642014165, + "grad_norm": 0.21768101783798655, + "learning_rate": 2.166889440089636e-05, + "loss": 0.3197, + "step": 1651 + }, + { + "epoch": 5.210857592446892, + "grad_norm": 0.2105177118679401, + "learning_rate": 2.165050189651281e-05, + "loss": 0.3312, + "step": 1652 + }, + { + "epoch": 5.214004720692368, + "grad_norm": 0.21009669854087792, + "learning_rate": 2.163210975716619e-05, + "loss": 0.3288, + "step": 1653 + }, + { + "epoch": 5.217151848937844, + "grad_norm": 0.21015093693379167, + "learning_rate": 2.1613718002066363e-05, + "loss": 0.3296, + "step": 1654 + }, + { + "epoch": 5.220298977183321, + "grad_norm": 0.22642270974424877, + "learning_rate": 2.1595326650422784e-05, + "loss": 0.325, + "step": 1655 + }, + { + "epoch": 5.223446105428796, + "grad_norm": 0.20862201953387366, + "learning_rate": 2.15769357214445e-05, + "loss": 0.3287, + "step": 1656 + }, + { + "epoch": 5.226593233674272, + "grad_norm": 0.23556943931498991, + "learning_rate": 2.1558545234340108e-05, + "loss": 0.3208, + "step": 1657 + }, + { + "epoch": 5.229740361919748, + "grad_norm": 0.21273958624925166, + "learning_rate": 2.1540155208317736e-05, + "loss": 0.3254, + "step": 1658 + }, + { + "epoch": 5.232887490165224, + "grad_norm": 0.21120587575901487, + "learning_rate": 2.1521765662585047e-05, + "loss": 0.3278, + "step": 1659 + }, + { + "epoch": 5.2360346184107005, + "grad_norm": 0.2191912573575056, + "learning_rate": 2.150337661634918e-05, + "loss": 0.3275, + "step": 1660 + }, + { + "epoch": 5.239181746656176, + "grad_norm": 0.18918312365625706, + "learning_rate": 2.1484988088816784e-05, + "loss": 0.3245, + "step": 1661 + }, + { + "epoch": 5.242328874901652, + "grad_norm": 0.24442600792973201, + "learning_rate": 2.146660009919393e-05, + "loss": 0.3366, + "step": 1662 + }, + { + "epoch": 5.245476003147128, + "grad_norm": 0.19190784043500905, + "learning_rate": 2.1448212666686153e-05, + "loss": 0.3235, + "step": 1663 + }, + { + "epoch": 5.248623131392605, + "grad_norm": 0.19845803273670526, + "learning_rate": 2.1429825810498405e-05, + "loss": 0.3247, + "step": 1664 + }, + { + "epoch": 5.25177025963808, + "grad_norm": 0.22683790832172754, + "learning_rate": 2.141143954983502e-05, + "loss": 0.3277, + "step": 1665 + }, + { + "epoch": 5.254917387883556, + "grad_norm": 0.20007675897146535, + "learning_rate": 2.1393053903899715e-05, + "loss": 0.3293, + "step": 1666 + }, + { + "epoch": 5.258064516129032, + "grad_norm": 0.22364967785365925, + "learning_rate": 2.1374668891895586e-05, + "loss": 0.3317, + "step": 1667 + }, + { + "epoch": 5.261211644374509, + "grad_norm": 0.19696570309865535, + "learning_rate": 2.1356284533025034e-05, + "loss": 0.3357, + "step": 1668 + }, + { + "epoch": 5.2643587726199845, + "grad_norm": 0.20720814373699586, + "learning_rate": 2.1337900846489794e-05, + "loss": 0.3304, + "step": 1669 + }, + { + "epoch": 5.26750590086546, + "grad_norm": 0.22251808274139923, + "learning_rate": 2.1319517851490917e-05, + "loss": 0.3342, + "step": 1670 + }, + { + "epoch": 5.270653029110936, + "grad_norm": 0.19960532937969883, + "learning_rate": 2.130113556722869e-05, + "loss": 0.3213, + "step": 1671 + }, + { + "epoch": 5.273800157356412, + "grad_norm": 0.22611359477988568, + "learning_rate": 2.12827540129027e-05, + "loss": 0.3304, + "step": 1672 + }, + { + "epoch": 5.276947285601889, + "grad_norm": 0.21377559505306823, + "learning_rate": 2.126437320771175e-05, + "loss": 0.333, + "step": 1673 + }, + { + "epoch": 5.280094413847364, + "grad_norm": 0.21364402742573374, + "learning_rate": 2.124599317085385e-05, + "loss": 0.3252, + "step": 1674 + }, + { + "epoch": 5.28324154209284, + "grad_norm": 0.20440996232135555, + "learning_rate": 2.1227613921526234e-05, + "loss": 0.3302, + "step": 1675 + }, + { + "epoch": 5.286388670338316, + "grad_norm": 0.20439524727339334, + "learning_rate": 2.1209235478925292e-05, + "loss": 0.327, + "step": 1676 + }, + { + "epoch": 5.289535798583792, + "grad_norm": 0.21129883255126156, + "learning_rate": 2.1190857862246587e-05, + "loss": 0.3317, + "step": 1677 + }, + { + "epoch": 5.2926829268292686, + "grad_norm": 0.1832955706368962, + "learning_rate": 2.1172481090684803e-05, + "loss": 0.3285, + "step": 1678 + }, + { + "epoch": 5.295830055074744, + "grad_norm": 0.21771566260776035, + "learning_rate": 2.1154105183433758e-05, + "loss": 0.3296, + "step": 1679 + }, + { + "epoch": 5.29897718332022, + "grad_norm": 0.1943908762637486, + "learning_rate": 2.1135730159686355e-05, + "loss": 0.3378, + "step": 1680 + }, + { + "epoch": 5.302124311565696, + "grad_norm": 0.19415021103057115, + "learning_rate": 2.1117356038634584e-05, + "loss": 0.3284, + "step": 1681 + }, + { + "epoch": 5.305271439811173, + "grad_norm": 0.19441982459516802, + "learning_rate": 2.109898283946948e-05, + "loss": 0.3238, + "step": 1682 + }, + { + "epoch": 5.3084185680566485, + "grad_norm": 0.19773621537262287, + "learning_rate": 2.1080610581381128e-05, + "loss": 0.3285, + "step": 1683 + }, + { + "epoch": 5.311565696302124, + "grad_norm": 0.2120736125028019, + "learning_rate": 2.106223928355861e-05, + "loss": 0.3324, + "step": 1684 + }, + { + "epoch": 5.3147128245476, + "grad_norm": 0.19760073719764953, + "learning_rate": 2.1043868965190045e-05, + "loss": 0.3324, + "step": 1685 + }, + { + "epoch": 5.317859952793077, + "grad_norm": 0.19405070182884, + "learning_rate": 2.1025499645462485e-05, + "loss": 0.3375, + "step": 1686 + }, + { + "epoch": 5.321007081038553, + "grad_norm": 0.1956189468411377, + "learning_rate": 2.100713134356196e-05, + "loss": 0.3255, + "step": 1687 + }, + { + "epoch": 5.324154209284028, + "grad_norm": 0.19321084864706617, + "learning_rate": 2.098876407867344e-05, + "loss": 0.3308, + "step": 1688 + }, + { + "epoch": 5.327301337529504, + "grad_norm": 0.19304288190055158, + "learning_rate": 2.0970397869980798e-05, + "loss": 0.3286, + "step": 1689 + }, + { + "epoch": 5.33044846577498, + "grad_norm": 0.1986064395829299, + "learning_rate": 2.0952032736666817e-05, + "loss": 0.3291, + "step": 1690 + }, + { + "epoch": 5.333595594020457, + "grad_norm": 0.19746810657897224, + "learning_rate": 2.0933668697913148e-05, + "loss": 0.3336, + "step": 1691 + }, + { + "epoch": 5.3367427222659325, + "grad_norm": 0.19729660360055334, + "learning_rate": 2.09153057729003e-05, + "loss": 0.3348, + "step": 1692 + }, + { + "epoch": 5.339889850511408, + "grad_norm": 0.18855256963341346, + "learning_rate": 2.0896943980807633e-05, + "loss": 0.3372, + "step": 1693 + }, + { + "epoch": 5.343036978756884, + "grad_norm": 0.2009978457776624, + "learning_rate": 2.0878583340813295e-05, + "loss": 0.3288, + "step": 1694 + }, + { + "epoch": 5.34618410700236, + "grad_norm": 0.20225991858456713, + "learning_rate": 2.0860223872094264e-05, + "loss": 0.3271, + "step": 1695 + }, + { + "epoch": 5.349331235247837, + "grad_norm": 0.18569033139215133, + "learning_rate": 2.084186559382627e-05, + "loss": 0.3287, + "step": 1696 + }, + { + "epoch": 5.352478363493312, + "grad_norm": 0.19538260959362502, + "learning_rate": 2.0823508525183805e-05, + "loss": 0.3249, + "step": 1697 + }, + { + "epoch": 5.355625491738788, + "grad_norm": 0.20717894047667273, + "learning_rate": 2.08051526853401e-05, + "loss": 0.3336, + "step": 1698 + }, + { + "epoch": 5.358772619984264, + "grad_norm": 0.19701862641818213, + "learning_rate": 2.0786798093467114e-05, + "loss": 0.3344, + "step": 1699 + }, + { + "epoch": 5.361919748229741, + "grad_norm": 0.1906266432884064, + "learning_rate": 2.0768444768735478e-05, + "loss": 0.3334, + "step": 1700 + }, + { + "epoch": 5.3650668764752165, + "grad_norm": 0.21331000047158513, + "learning_rate": 2.0750092730314522e-05, + "loss": 0.3349, + "step": 1701 + }, + { + "epoch": 5.368214004720692, + "grad_norm": 0.19487279888174047, + "learning_rate": 2.0731741997372228e-05, + "loss": 0.3211, + "step": 1702 + }, + { + "epoch": 5.371361132966168, + "grad_norm": 0.21324736219816784, + "learning_rate": 2.071339258907519e-05, + "loss": 0.3385, + "step": 1703 + }, + { + "epoch": 5.374508261211645, + "grad_norm": 0.18260011303295876, + "learning_rate": 2.0695044524588658e-05, + "loss": 0.332, + "step": 1704 + }, + { + "epoch": 5.377655389457121, + "grad_norm": 0.21643053077446903, + "learning_rate": 2.0676697823076453e-05, + "loss": 0.326, + "step": 1705 + }, + { + "epoch": 5.380802517702596, + "grad_norm": 0.18935558592473964, + "learning_rate": 2.065835250370098e-05, + "loss": 0.3286, + "step": 1706 + }, + { + "epoch": 5.383949645948072, + "grad_norm": 0.21738265810196228, + "learning_rate": 2.064000858562319e-05, + "loss": 0.327, + "step": 1707 + }, + { + "epoch": 5.387096774193548, + "grad_norm": 0.188452014974482, + "learning_rate": 2.0621666088002586e-05, + "loss": 0.3363, + "step": 1708 + }, + { + "epoch": 5.390243902439025, + "grad_norm": 0.21149246855169024, + "learning_rate": 2.060332502999719e-05, + "loss": 0.3342, + "step": 1709 + }, + { + "epoch": 5.3933910306845005, + "grad_norm": 0.1994954339555043, + "learning_rate": 2.0584985430763483e-05, + "loss": 0.333, + "step": 1710 + }, + { + "epoch": 5.396538158929976, + "grad_norm": 0.18853756790169016, + "learning_rate": 2.0566647309456476e-05, + "loss": 0.3344, + "step": 1711 + }, + { + "epoch": 5.399685287175452, + "grad_norm": 0.19943484129450234, + "learning_rate": 2.0548310685229605e-05, + "loss": 0.3345, + "step": 1712 + }, + { + "epoch": 5.402832415420928, + "grad_norm": 0.1941995141451995, + "learning_rate": 2.052997557723474e-05, + "loss": 0.3282, + "step": 1713 + }, + { + "epoch": 5.405979543666405, + "grad_norm": 0.19191713025156307, + "learning_rate": 2.051164200462218e-05, + "loss": 0.3345, + "step": 1714 + }, + { + "epoch": 5.4091266719118805, + "grad_norm": 0.18974347512799264, + "learning_rate": 2.0493309986540626e-05, + "loss": 0.3413, + "step": 1715 + }, + { + "epoch": 5.412273800157356, + "grad_norm": 0.19403906103651297, + "learning_rate": 2.047497954213713e-05, + "loss": 0.33, + "step": 1716 + }, + { + "epoch": 5.415420928402832, + "grad_norm": 0.19827267122676, + "learning_rate": 2.0456650690557126e-05, + "loss": 0.3347, + "step": 1717 + }, + { + "epoch": 5.418568056648309, + "grad_norm": 0.18940704828298557, + "learning_rate": 2.043832345094436e-05, + "loss": 0.331, + "step": 1718 + }, + { + "epoch": 5.421715184893785, + "grad_norm": 0.19382447016721407, + "learning_rate": 2.041999784244092e-05, + "loss": 0.3403, + "step": 1719 + }, + { + "epoch": 5.42486231313926, + "grad_norm": 0.1908670547017546, + "learning_rate": 2.0401673884187178e-05, + "loss": 0.3382, + "step": 1720 + }, + { + "epoch": 5.428009441384736, + "grad_norm": 0.18977270804467197, + "learning_rate": 2.0383351595321777e-05, + "loss": 0.3269, + "step": 1721 + }, + { + "epoch": 5.431156569630213, + "grad_norm": 0.19390756152600916, + "learning_rate": 2.036503099498163e-05, + "loss": 0.3351, + "step": 1722 + }, + { + "epoch": 5.434303697875689, + "grad_norm": 0.1822739884958527, + "learning_rate": 2.034671210230187e-05, + "loss": 0.3283, + "step": 1723 + }, + { + "epoch": 5.4374508261211645, + "grad_norm": 0.20644018287214425, + "learning_rate": 2.0328394936415862e-05, + "loss": 0.333, + "step": 1724 + }, + { + "epoch": 5.44059795436664, + "grad_norm": 0.19229713655862757, + "learning_rate": 2.0310079516455158e-05, + "loss": 0.3336, + "step": 1725 + }, + { + "epoch": 5.443745082612116, + "grad_norm": 0.19157755715085112, + "learning_rate": 2.0291765861549485e-05, + "loss": 0.3319, + "step": 1726 + }, + { + "epoch": 5.446892210857593, + "grad_norm": 0.20424545745124856, + "learning_rate": 2.0273453990826734e-05, + "loss": 0.3368, + "step": 1727 + }, + { + "epoch": 5.450039339103069, + "grad_norm": 0.19085673571348755, + "learning_rate": 2.0255143923412926e-05, + "loss": 0.3334, + "step": 1728 + }, + { + "epoch": 5.453186467348544, + "grad_norm": 0.18918109876476505, + "learning_rate": 2.0236835678432216e-05, + "loss": 0.3475, + "step": 1729 + }, + { + "epoch": 5.45633359559402, + "grad_norm": 0.20280285926641467, + "learning_rate": 2.0218529275006823e-05, + "loss": 0.3286, + "step": 1730 + }, + { + "epoch": 5.459480723839496, + "grad_norm": 0.1916773765490627, + "learning_rate": 2.020022473225707e-05, + "loss": 0.3391, + "step": 1731 + }, + { + "epoch": 5.462627852084973, + "grad_norm": 0.19384714862396943, + "learning_rate": 2.0181922069301323e-05, + "loss": 0.3264, + "step": 1732 + }, + { + "epoch": 5.4657749803304485, + "grad_norm": 0.18923980213552452, + "learning_rate": 2.016362130525599e-05, + "loss": 0.3331, + "step": 1733 + }, + { + "epoch": 5.468922108575924, + "grad_norm": 0.17898327293803962, + "learning_rate": 2.0145322459235496e-05, + "loss": 0.3362, + "step": 1734 + }, + { + "epoch": 5.4720692368214, + "grad_norm": 0.19587397865824116, + "learning_rate": 2.0127025550352255e-05, + "loss": 0.3199, + "step": 1735 + }, + { + "epoch": 5.475216365066877, + "grad_norm": 0.1941075025448282, + "learning_rate": 2.010873059771667e-05, + "loss": 0.3349, + "step": 1736 + }, + { + "epoch": 5.478363493312353, + "grad_norm": 0.1771702814313618, + "learning_rate": 2.0090437620437097e-05, + "loss": 0.3255, + "step": 1737 + }, + { + "epoch": 5.481510621557828, + "grad_norm": 0.19485668971197115, + "learning_rate": 2.0072146637619822e-05, + "loss": 0.3227, + "step": 1738 + }, + { + "epoch": 5.484657749803304, + "grad_norm": 0.19804684145553095, + "learning_rate": 2.0053857668369054e-05, + "loss": 0.3282, + "step": 1739 + }, + { + "epoch": 5.487804878048781, + "grad_norm": 0.1918642737386288, + "learning_rate": 2.0035570731786898e-05, + "loss": 0.3308, + "step": 1740 + }, + { + "epoch": 5.490952006294257, + "grad_norm": 0.196904058951029, + "learning_rate": 2.001728584697332e-05, + "loss": 0.3369, + "step": 1741 + }, + { + "epoch": 5.4940991345397325, + "grad_norm": 0.19456782195452768, + "learning_rate": 1.999900303302618e-05, + "loss": 0.3304, + "step": 1742 + }, + { + "epoch": 5.497246262785208, + "grad_norm": 0.18856607162185943, + "learning_rate": 1.9980722309041153e-05, + "loss": 0.3387, + "step": 1743 + }, + { + "epoch": 5.500393391030684, + "grad_norm": 0.19651502014126437, + "learning_rate": 1.996244369411171e-05, + "loss": 0.337, + "step": 1744 + }, + { + "epoch": 5.503540519276161, + "grad_norm": 0.18726822437116097, + "learning_rate": 1.9944167207329163e-05, + "loss": 0.3353, + "step": 1745 + }, + { + "epoch": 5.506687647521637, + "grad_norm": 0.19603676603474196, + "learning_rate": 1.992589286778257e-05, + "loss": 0.3258, + "step": 1746 + }, + { + "epoch": 5.5098347757671124, + "grad_norm": 0.182602364247368, + "learning_rate": 1.9907620694558757e-05, + "loss": 0.3324, + "step": 1747 + }, + { + "epoch": 5.512981904012588, + "grad_norm": 0.18782759246781305, + "learning_rate": 1.9889350706742278e-05, + "loss": 0.3314, + "step": 1748 + }, + { + "epoch": 5.516129032258064, + "grad_norm": 0.1966254586938506, + "learning_rate": 1.9871082923415418e-05, + "loss": 0.3361, + "step": 1749 + }, + { + "epoch": 5.519276160503541, + "grad_norm": 0.2000474732115638, + "learning_rate": 1.9852817363658157e-05, + "loss": 0.334, + "step": 1750 + }, + { + "epoch": 5.522423288749017, + "grad_norm": 0.18864691630189573, + "learning_rate": 1.983455404654814e-05, + "loss": 0.3384, + "step": 1751 + }, + { + "epoch": 5.525570416994492, + "grad_norm": 0.17699746763746177, + "learning_rate": 1.9816292991160682e-05, + "loss": 0.3369, + "step": 1752 + }, + { + "epoch": 5.528717545239968, + "grad_norm": 0.2046692057530345, + "learning_rate": 1.979803421656874e-05, + "loss": 0.3324, + "step": 1753 + }, + { + "epoch": 5.531864673485445, + "grad_norm": 0.17773907244324694, + "learning_rate": 1.977977774184287e-05, + "loss": 0.3306, + "step": 1754 + }, + { + "epoch": 5.535011801730921, + "grad_norm": 0.2060724377420153, + "learning_rate": 1.9761523586051247e-05, + "loss": 0.3347, + "step": 1755 + }, + { + "epoch": 5.5381589299763965, + "grad_norm": 0.19361078551446964, + "learning_rate": 1.9743271768259597e-05, + "loss": 0.3293, + "step": 1756 + }, + { + "epoch": 5.541306058221872, + "grad_norm": 0.20747437115821293, + "learning_rate": 1.9725022307531238e-05, + "loss": 0.3319, + "step": 1757 + }, + { + "epoch": 5.544453186467349, + "grad_norm": 0.20542783975059725, + "learning_rate": 1.970677522292701e-05, + "loss": 0.3293, + "step": 1758 + }, + { + "epoch": 5.547600314712825, + "grad_norm": 0.20422541341547726, + "learning_rate": 1.9688530533505262e-05, + "loss": 0.3298, + "step": 1759 + }, + { + "epoch": 5.550747442958301, + "grad_norm": 0.19261928228370265, + "learning_rate": 1.9670288258321844e-05, + "loss": 0.3291, + "step": 1760 + }, + { + "epoch": 5.553894571203776, + "grad_norm": 0.21510991198866017, + "learning_rate": 1.965204841643011e-05, + "loss": 0.3355, + "step": 1761 + }, + { + "epoch": 5.557041699449252, + "grad_norm": 0.20838170757363106, + "learning_rate": 1.9633811026880836e-05, + "loss": 0.3361, + "step": 1762 + }, + { + "epoch": 5.560188827694729, + "grad_norm": 0.19924189860550076, + "learning_rate": 1.961557610872226e-05, + "loss": 0.3408, + "step": 1763 + }, + { + "epoch": 5.563335955940205, + "grad_norm": 0.19631983491806956, + "learning_rate": 1.9597343681000026e-05, + "loss": 0.3314, + "step": 1764 + }, + { + "epoch": 5.5664830841856805, + "grad_norm": 0.20427189176713215, + "learning_rate": 1.9579113762757193e-05, + "loss": 0.3343, + "step": 1765 + }, + { + "epoch": 5.569630212431156, + "grad_norm": 0.20654636293900627, + "learning_rate": 1.956088637303418e-05, + "loss": 0.3391, + "step": 1766 + }, + { + "epoch": 5.572777340676632, + "grad_norm": 0.21104773479274883, + "learning_rate": 1.954266153086877e-05, + "loss": 0.342, + "step": 1767 + }, + { + "epoch": 5.575924468922109, + "grad_norm": 0.20296107799646898, + "learning_rate": 1.9524439255296105e-05, + "loss": 0.3327, + "step": 1768 + }, + { + "epoch": 5.579071597167585, + "grad_norm": 0.18312485399358078, + "learning_rate": 1.9506219565348622e-05, + "loss": 0.3423, + "step": 1769 + }, + { + "epoch": 5.58221872541306, + "grad_norm": 0.20577262423625045, + "learning_rate": 1.948800248005605e-05, + "loss": 0.3312, + "step": 1770 + }, + { + "epoch": 5.585365853658536, + "grad_norm": 0.18830784164844272, + "learning_rate": 1.946978801844544e-05, + "loss": 0.3314, + "step": 1771 + }, + { + "epoch": 5.588512981904013, + "grad_norm": 0.1983034432372846, + "learning_rate": 1.9451576199541063e-05, + "loss": 0.3369, + "step": 1772 + }, + { + "epoch": 5.591660110149489, + "grad_norm": 0.18977643322424897, + "learning_rate": 1.9433367042364447e-05, + "loss": 0.331, + "step": 1773 + }, + { + "epoch": 5.5948072383949645, + "grad_norm": 0.20788619352037782, + "learning_rate": 1.941516056593433e-05, + "loss": 0.3308, + "step": 1774 + }, + { + "epoch": 5.59795436664044, + "grad_norm": 0.18636419597292284, + "learning_rate": 1.9396956789266663e-05, + "loss": 0.3418, + "step": 1775 + }, + { + "epoch": 5.601101494885917, + "grad_norm": 0.2241714096919009, + "learning_rate": 1.9378755731374557e-05, + "loss": 0.3375, + "step": 1776 + }, + { + "epoch": 5.604248623131393, + "grad_norm": 0.17829719329566568, + "learning_rate": 1.9360557411268307e-05, + "loss": 0.3348, + "step": 1777 + }, + { + "epoch": 5.607395751376869, + "grad_norm": 0.20049814376921224, + "learning_rate": 1.9342361847955345e-05, + "loss": 0.3238, + "step": 1778 + }, + { + "epoch": 5.610542879622344, + "grad_norm": 0.20325169827837897, + "learning_rate": 1.9324169060440194e-05, + "loss": 0.3337, + "step": 1779 + }, + { + "epoch": 5.61369000786782, + "grad_norm": 0.18595173050891842, + "learning_rate": 1.930597906772452e-05, + "loss": 0.3361, + "step": 1780 + }, + { + "epoch": 5.616837136113297, + "grad_norm": 0.19965999907742346, + "learning_rate": 1.9287791888807048e-05, + "loss": 0.338, + "step": 1781 + }, + { + "epoch": 5.619984264358773, + "grad_norm": 0.19545992600515816, + "learning_rate": 1.9269607542683552e-05, + "loss": 0.3359, + "step": 1782 + }, + { + "epoch": 5.6231313926042485, + "grad_norm": 0.2188054711004221, + "learning_rate": 1.9251426048346877e-05, + "loss": 0.3378, + "step": 1783 + }, + { + "epoch": 5.626278520849724, + "grad_norm": 0.19412474756377676, + "learning_rate": 1.923324742478686e-05, + "loss": 0.336, + "step": 1784 + }, + { + "epoch": 5.6294256490952, + "grad_norm": 0.22045546487078188, + "learning_rate": 1.9215071690990365e-05, + "loss": 0.3237, + "step": 1785 + }, + { + "epoch": 5.632572777340677, + "grad_norm": 0.1936651347415692, + "learning_rate": 1.9196898865941227e-05, + "loss": 0.3348, + "step": 1786 + }, + { + "epoch": 5.635719905586153, + "grad_norm": 0.21256108861119585, + "learning_rate": 1.917872896862024e-05, + "loss": 0.3355, + "step": 1787 + }, + { + "epoch": 5.6388670338316285, + "grad_norm": 0.19411087471884958, + "learning_rate": 1.916056201800514e-05, + "loss": 0.328, + "step": 1788 + }, + { + "epoch": 5.642014162077104, + "grad_norm": 0.19203184010960683, + "learning_rate": 1.9142398033070585e-05, + "loss": 0.3332, + "step": 1789 + }, + { + "epoch": 5.645161290322581, + "grad_norm": 0.19116732074421183, + "learning_rate": 1.9124237032788144e-05, + "loss": 0.3289, + "step": 1790 + }, + { + "epoch": 5.648308418568057, + "grad_norm": 0.2029836866282038, + "learning_rate": 1.910607903612626e-05, + "loss": 0.3301, + "step": 1791 + }, + { + "epoch": 5.651455546813533, + "grad_norm": 0.18620591358236213, + "learning_rate": 1.9087924062050235e-05, + "loss": 0.3273, + "step": 1792 + }, + { + "epoch": 5.654602675059008, + "grad_norm": 0.194743778672627, + "learning_rate": 1.9069772129522236e-05, + "loss": 0.3361, + "step": 1793 + }, + { + "epoch": 5.657749803304485, + "grad_norm": 0.17439717206736727, + "learning_rate": 1.9051623257501223e-05, + "loss": 0.3359, + "step": 1794 + }, + { + "epoch": 5.660896931549961, + "grad_norm": 0.21536385200503502, + "learning_rate": 1.9033477464942985e-05, + "loss": 0.3316, + "step": 1795 + }, + { + "epoch": 5.664044059795437, + "grad_norm": 0.19927232488706062, + "learning_rate": 1.9015334770800084e-05, + "loss": 0.3428, + "step": 1796 + }, + { + "epoch": 5.6671911880409125, + "grad_norm": 0.18222541338952222, + "learning_rate": 1.899719519402183e-05, + "loss": 0.3371, + "step": 1797 + }, + { + "epoch": 5.670338316286388, + "grad_norm": 0.20776846100700097, + "learning_rate": 1.897905875355431e-05, + "loss": 0.3333, + "step": 1798 + }, + { + "epoch": 5.673485444531865, + "grad_norm": 0.20498121411547227, + "learning_rate": 1.89609254683403e-05, + "loss": 0.337, + "step": 1799 + }, + { + "epoch": 5.676632572777341, + "grad_norm": 0.2057071418903273, + "learning_rate": 1.8942795357319325e-05, + "loss": 0.3422, + "step": 1800 + }, + { + "epoch": 5.679779701022817, + "grad_norm": 0.18447805097886538, + "learning_rate": 1.892466843942754e-05, + "loss": 0.3357, + "step": 1801 + }, + { + "epoch": 5.682926829268292, + "grad_norm": 0.20151427634192418, + "learning_rate": 1.8906544733597817e-05, + "loss": 0.3341, + "step": 1802 + }, + { + "epoch": 5.686073957513768, + "grad_norm": 0.19101557171738096, + "learning_rate": 1.888842425875964e-05, + "loss": 0.3396, + "step": 1803 + }, + { + "epoch": 5.689221085759245, + "grad_norm": 0.2109852771210735, + "learning_rate": 1.887030703383912e-05, + "loss": 0.3392, + "step": 1804 + }, + { + "epoch": 5.692368214004721, + "grad_norm": 0.19115514267570324, + "learning_rate": 1.885219307775899e-05, + "loss": 0.3363, + "step": 1805 + }, + { + "epoch": 5.6955153422501965, + "grad_norm": 0.21293008134701097, + "learning_rate": 1.8834082409438553e-05, + "loss": 0.3328, + "step": 1806 + }, + { + "epoch": 5.698662470495672, + "grad_norm": 0.18526154866614372, + "learning_rate": 1.8815975047793694e-05, + "loss": 0.3273, + "step": 1807 + }, + { + "epoch": 5.701809598741149, + "grad_norm": 0.18777683810128346, + "learning_rate": 1.8797871011736823e-05, + "loss": 0.3392, + "step": 1808 + }, + { + "epoch": 5.704956726986625, + "grad_norm": 0.18257028157402827, + "learning_rate": 1.87797703201769e-05, + "loss": 0.3303, + "step": 1809 + }, + { + "epoch": 5.708103855232101, + "grad_norm": 0.19418106072360075, + "learning_rate": 1.8761672992019377e-05, + "loss": 0.3344, + "step": 1810 + }, + { + "epoch": 5.711250983477576, + "grad_norm": 0.18457444042449656, + "learning_rate": 1.87435790461662e-05, + "loss": 0.3278, + "step": 1811 + }, + { + "epoch": 5.714398111723053, + "grad_norm": 0.18582325962284854, + "learning_rate": 1.872548850151577e-05, + "loss": 0.3264, + "step": 1812 + }, + { + "epoch": 5.717545239968529, + "grad_norm": 0.18103618532916696, + "learning_rate": 1.8707401376962946e-05, + "loss": 0.3315, + "step": 1813 + }, + { + "epoch": 5.720692368214005, + "grad_norm": 0.18382505326981982, + "learning_rate": 1.8689317691399026e-05, + "loss": 0.3367, + "step": 1814 + }, + { + "epoch": 5.7238394964594805, + "grad_norm": 0.1908586561032934, + "learning_rate": 1.867123746371169e-05, + "loss": 0.3315, + "step": 1815 + }, + { + "epoch": 5.726986624704956, + "grad_norm": 0.18359448429174133, + "learning_rate": 1.865316071278503e-05, + "loss": 0.3352, + "step": 1816 + }, + { + "epoch": 5.730133752950433, + "grad_norm": 0.19607618023789522, + "learning_rate": 1.8635087457499485e-05, + "loss": 0.3319, + "step": 1817 + }, + { + "epoch": 5.733280881195909, + "grad_norm": 0.17583841531340308, + "learning_rate": 1.8617017716731865e-05, + "loss": 0.334, + "step": 1818 + }, + { + "epoch": 5.736428009441385, + "grad_norm": 0.1866341211246049, + "learning_rate": 1.8598951509355293e-05, + "loss": 0.33, + "step": 1819 + }, + { + "epoch": 5.7395751376868605, + "grad_norm": 0.18608286547921962, + "learning_rate": 1.8580888854239213e-05, + "loss": 0.3361, + "step": 1820 + }, + { + "epoch": 5.742722265932336, + "grad_norm": 0.18536619850476266, + "learning_rate": 1.856282977024935e-05, + "loss": 0.3387, + "step": 1821 + }, + { + "epoch": 5.745869394177813, + "grad_norm": 0.2015065945779001, + "learning_rate": 1.85447742762477e-05, + "loss": 0.3413, + "step": 1822 + }, + { + "epoch": 5.749016522423289, + "grad_norm": 0.19420792957675226, + "learning_rate": 1.8526722391092513e-05, + "loss": 0.3379, + "step": 1823 + }, + { + "epoch": 5.752163650668765, + "grad_norm": 0.20788168048073424, + "learning_rate": 1.850867413363827e-05, + "loss": 0.3299, + "step": 1824 + }, + { + "epoch": 5.755310778914241, + "grad_norm": 0.20012818061899734, + "learning_rate": 1.8490629522735658e-05, + "loss": 0.335, + "step": 1825 + }, + { + "epoch": 5.758457907159717, + "grad_norm": 0.19094123225319237, + "learning_rate": 1.8472588577231558e-05, + "loss": 0.3289, + "step": 1826 + }, + { + "epoch": 5.761605035405193, + "grad_norm": 0.1903248865857444, + "learning_rate": 1.8454551315969023e-05, + "loss": 0.3328, + "step": 1827 + }, + { + "epoch": 5.764752163650669, + "grad_norm": 0.19908659395090694, + "learning_rate": 1.8436517757787268e-05, + "loss": 0.3289, + "step": 1828 + }, + { + "epoch": 5.7678992918961445, + "grad_norm": 0.19909089376488692, + "learning_rate": 1.841848792152162e-05, + "loss": 0.3317, + "step": 1829 + }, + { + "epoch": 5.771046420141621, + "grad_norm": 0.18638784799057997, + "learning_rate": 1.8400461826003536e-05, + "loss": 0.3296, + "step": 1830 + }, + { + "epoch": 5.774193548387097, + "grad_norm": 0.21107469641971743, + "learning_rate": 1.8382439490060556e-05, + "loss": 0.341, + "step": 1831 + }, + { + "epoch": 5.777340676632573, + "grad_norm": 0.18826238700241835, + "learning_rate": 1.8364420932516296e-05, + "loss": 0.3352, + "step": 1832 + }, + { + "epoch": 5.780487804878049, + "grad_norm": 0.2011868737529004, + "learning_rate": 1.8346406172190415e-05, + "loss": 0.3373, + "step": 1833 + }, + { + "epoch": 5.783634933123524, + "grad_norm": 0.18535453103657012, + "learning_rate": 1.8328395227898638e-05, + "loss": 0.3324, + "step": 1834 + }, + { + "epoch": 5.786782061369001, + "grad_norm": 0.2043412279322104, + "learning_rate": 1.8310388118452676e-05, + "loss": 0.3263, + "step": 1835 + }, + { + "epoch": 5.789929189614477, + "grad_norm": 0.18684348371083476, + "learning_rate": 1.829238486266023e-05, + "loss": 0.3286, + "step": 1836 + }, + { + "epoch": 5.793076317859953, + "grad_norm": 0.20419794528852878, + "learning_rate": 1.8274385479325003e-05, + "loss": 0.3272, + "step": 1837 + }, + { + "epoch": 5.7962234461054285, + "grad_norm": 0.1940073858019324, + "learning_rate": 1.825638998724663e-05, + "loss": 0.3332, + "step": 1838 + }, + { + "epoch": 5.799370574350904, + "grad_norm": 0.19347840957399853, + "learning_rate": 1.8238398405220693e-05, + "loss": 0.3351, + "step": 1839 + }, + { + "epoch": 5.802517702596381, + "grad_norm": 0.1889872990752563, + "learning_rate": 1.8220410752038683e-05, + "loss": 0.3316, + "step": 1840 + }, + { + "epoch": 5.805664830841857, + "grad_norm": 0.18586354781067246, + "learning_rate": 1.8202427046487998e-05, + "loss": 0.3341, + "step": 1841 + }, + { + "epoch": 5.808811959087333, + "grad_norm": 0.1966780805383021, + "learning_rate": 1.8184447307351892e-05, + "loss": 0.3355, + "step": 1842 + }, + { + "epoch": 5.811959087332809, + "grad_norm": 0.18121589168030264, + "learning_rate": 1.8166471553409515e-05, + "loss": 0.3383, + "step": 1843 + }, + { + "epoch": 5.815106215578285, + "grad_norm": 0.20505699055277316, + "learning_rate": 1.8148499803435814e-05, + "loss": 0.3398, + "step": 1844 + }, + { + "epoch": 5.818253343823761, + "grad_norm": 0.18962686805322654, + "learning_rate": 1.8130532076201567e-05, + "loss": 0.3265, + "step": 1845 + }, + { + "epoch": 5.821400472069237, + "grad_norm": 0.18582561112060875, + "learning_rate": 1.811256839047337e-05, + "loss": 0.3293, + "step": 1846 + }, + { + "epoch": 5.8245476003147125, + "grad_norm": 0.19257957858004238, + "learning_rate": 1.809460876501356e-05, + "loss": 0.3262, + "step": 1847 + }, + { + "epoch": 5.827694728560189, + "grad_norm": 0.197182828158111, + "learning_rate": 1.8076653218580275e-05, + "loss": 0.3323, + "step": 1848 + }, + { + "epoch": 5.830841856805665, + "grad_norm": 0.1879475617365794, + "learning_rate": 1.8058701769927355e-05, + "loss": 0.334, + "step": 1849 + }, + { + "epoch": 5.833988985051141, + "grad_norm": 0.1826490417159589, + "learning_rate": 1.8040754437804394e-05, + "loss": 0.342, + "step": 1850 + }, + { + "epoch": 5.837136113296617, + "grad_norm": 0.17633023503177056, + "learning_rate": 1.8022811240956658e-05, + "loss": 0.3273, + "step": 1851 + }, + { + "epoch": 5.840283241542092, + "grad_norm": 0.18968026969150417, + "learning_rate": 1.800487219812511e-05, + "loss": 0.346, + "step": 1852 + }, + { + "epoch": 5.843430369787569, + "grad_norm": 0.1784380943724687, + "learning_rate": 1.7986937328046367e-05, + "loss": 0.3303, + "step": 1853 + }, + { + "epoch": 5.846577498033045, + "grad_norm": 0.1823752757582174, + "learning_rate": 1.796900664945269e-05, + "loss": 0.34, + "step": 1854 + }, + { + "epoch": 5.849724626278521, + "grad_norm": 0.1813510684645491, + "learning_rate": 1.795108018107197e-05, + "loss": 0.3412, + "step": 1855 + }, + { + "epoch": 5.8528717545239966, + "grad_norm": 0.1807432625218474, + "learning_rate": 1.7933157941627685e-05, + "loss": 0.3373, + "step": 1856 + }, + { + "epoch": 5.856018882769473, + "grad_norm": 0.19949499343633706, + "learning_rate": 1.7915239949838912e-05, + "loss": 0.3287, + "step": 1857 + }, + { + "epoch": 5.859166011014949, + "grad_norm": 0.18250218771454552, + "learning_rate": 1.7897326224420278e-05, + "loss": 0.3405, + "step": 1858 + }, + { + "epoch": 5.862313139260425, + "grad_norm": 0.194020691023345, + "learning_rate": 1.7879416784081964e-05, + "loss": 0.3346, + "step": 1859 + }, + { + "epoch": 5.865460267505901, + "grad_norm": 0.18925126458655542, + "learning_rate": 1.7861511647529673e-05, + "loss": 0.3364, + "step": 1860 + }, + { + "epoch": 5.868607395751377, + "grad_norm": 0.1911546369042113, + "learning_rate": 1.7843610833464605e-05, + "loss": 0.341, + "step": 1861 + }, + { + "epoch": 5.871754523996853, + "grad_norm": 0.19577966808550543, + "learning_rate": 1.782571436058346e-05, + "loss": 0.3364, + "step": 1862 + }, + { + "epoch": 5.874901652242329, + "grad_norm": 0.17291565385793384, + "learning_rate": 1.7807822247578385e-05, + "loss": 0.3338, + "step": 1863 + }, + { + "epoch": 5.878048780487805, + "grad_norm": 0.18608030763102454, + "learning_rate": 1.7789934513136988e-05, + "loss": 0.3334, + "step": 1864 + }, + { + "epoch": 5.881195908733281, + "grad_norm": 0.1885179067929735, + "learning_rate": 1.7772051175942294e-05, + "loss": 0.3379, + "step": 1865 + }, + { + "epoch": 5.884343036978757, + "grad_norm": 0.19141910450267036, + "learning_rate": 1.7754172254672758e-05, + "loss": 0.3361, + "step": 1866 + }, + { + "epoch": 5.887490165224233, + "grad_norm": 0.19065019403622055, + "learning_rate": 1.7736297768002185e-05, + "loss": 0.3387, + "step": 1867 + }, + { + "epoch": 5.890637293469709, + "grad_norm": 0.19092741311791903, + "learning_rate": 1.7718427734599783e-05, + "loss": 0.3428, + "step": 1868 + }, + { + "epoch": 5.893784421715185, + "grad_norm": 0.18860250598218833, + "learning_rate": 1.770056217313009e-05, + "loss": 0.336, + "step": 1869 + }, + { + "epoch": 5.8969315499606605, + "grad_norm": 0.18271946305489614, + "learning_rate": 1.7682701102252972e-05, + "loss": 0.343, + "step": 1870 + }, + { + "epoch": 5.900078678206137, + "grad_norm": 0.19868228391520923, + "learning_rate": 1.7664844540623608e-05, + "loss": 0.3425, + "step": 1871 + }, + { + "epoch": 5.903225806451613, + "grad_norm": 0.2011603878538626, + "learning_rate": 1.764699250689249e-05, + "loss": 0.3353, + "step": 1872 + }, + { + "epoch": 5.906372934697089, + "grad_norm": 0.1802992343069088, + "learning_rate": 1.762914501970534e-05, + "loss": 0.3409, + "step": 1873 + }, + { + "epoch": 5.909520062942565, + "grad_norm": 0.20444272040489966, + "learning_rate": 1.7611302097703157e-05, + "loss": 0.3347, + "step": 1874 + }, + { + "epoch": 5.912667191188041, + "grad_norm": 0.1862979242929073, + "learning_rate": 1.7593463759522168e-05, + "loss": 0.3314, + "step": 1875 + }, + { + "epoch": 5.915814319433517, + "grad_norm": 0.20220253909918987, + "learning_rate": 1.7575630023793816e-05, + "loss": 0.3377, + "step": 1876 + }, + { + "epoch": 5.918961447678993, + "grad_norm": 0.18686277913829963, + "learning_rate": 1.7557800909144728e-05, + "loss": 0.3384, + "step": 1877 + }, + { + "epoch": 5.922108575924469, + "grad_norm": 0.22245978325014654, + "learning_rate": 1.75399764341967e-05, + "loss": 0.3441, + "step": 1878 + }, + { + "epoch": 5.925255704169945, + "grad_norm": 0.1959826366457521, + "learning_rate": 1.7522156617566707e-05, + "loss": 0.3347, + "step": 1879 + }, + { + "epoch": 5.928402832415421, + "grad_norm": 0.19789454724861258, + "learning_rate": 1.7504341477866824e-05, + "loss": 0.3321, + "step": 1880 + }, + { + "epoch": 5.931549960660897, + "grad_norm": 0.1924382850939243, + "learning_rate": 1.7486531033704265e-05, + "loss": 0.3326, + "step": 1881 + }, + { + "epoch": 5.934697088906373, + "grad_norm": 0.20590864368684875, + "learning_rate": 1.7468725303681345e-05, + "loss": 0.3342, + "step": 1882 + }, + { + "epoch": 5.937844217151849, + "grad_norm": 0.1797118421700361, + "learning_rate": 1.7450924306395434e-05, + "loss": 0.3397, + "step": 1883 + }, + { + "epoch": 5.940991345397325, + "grad_norm": 0.20148625619435923, + "learning_rate": 1.7433128060438966e-05, + "loss": 0.3316, + "step": 1884 + }, + { + "epoch": 5.944138473642801, + "grad_norm": 0.18364545990645328, + "learning_rate": 1.741533658439942e-05, + "loss": 0.3362, + "step": 1885 + }, + { + "epoch": 5.947285601888277, + "grad_norm": 0.20904376933935323, + "learning_rate": 1.7397549896859286e-05, + "loss": 0.3363, + "step": 1886 + }, + { + "epoch": 5.950432730133753, + "grad_norm": 0.17911675822308773, + "learning_rate": 1.7379768016396062e-05, + "loss": 0.3426, + "step": 1887 + }, + { + "epoch": 5.9535798583792285, + "grad_norm": 0.19232534709476046, + "learning_rate": 1.736199096158221e-05, + "loss": 0.3347, + "step": 1888 + }, + { + "epoch": 5.956726986624705, + "grad_norm": 0.19023265048985652, + "learning_rate": 1.7344218750985166e-05, + "loss": 0.3388, + "step": 1889 + }, + { + "epoch": 5.959874114870181, + "grad_norm": 0.17953463564757774, + "learning_rate": 1.7326451403167293e-05, + "loss": 0.3329, + "step": 1890 + }, + { + "epoch": 5.963021243115657, + "grad_norm": 0.1802520063583937, + "learning_rate": 1.7308688936685882e-05, + "loss": 0.3432, + "step": 1891 + }, + { + "epoch": 5.966168371361133, + "grad_norm": 0.17752047516280367, + "learning_rate": 1.729093137009314e-05, + "loss": 0.3333, + "step": 1892 + }, + { + "epoch": 5.969315499606609, + "grad_norm": 0.1986277401406285, + "learning_rate": 1.7273178721936128e-05, + "loss": 0.3368, + "step": 1893 + }, + { + "epoch": 5.972462627852085, + "grad_norm": 0.18718316187850806, + "learning_rate": 1.7255431010756785e-05, + "loss": 0.338, + "step": 1894 + }, + { + "epoch": 5.975609756097561, + "grad_norm": 0.19078572446696887, + "learning_rate": 1.7237688255091903e-05, + "loss": 0.336, + "step": 1895 + }, + { + "epoch": 5.978756884343037, + "grad_norm": 0.20824480546412605, + "learning_rate": 1.721995047347308e-05, + "loss": 0.3451, + "step": 1896 + }, + { + "epoch": 5.9819040125885135, + "grad_norm": 0.1845164093165284, + "learning_rate": 1.7202217684426717e-05, + "loss": 0.3391, + "step": 1897 + }, + { + "epoch": 5.985051140833989, + "grad_norm": 0.19512142305230165, + "learning_rate": 1.7184489906474028e-05, + "loss": 0.3343, + "step": 1898 + }, + { + "epoch": 5.988198269079465, + "grad_norm": 0.17660675867730372, + "learning_rate": 1.716676715813096e-05, + "loss": 0.3434, + "step": 1899 + }, + { + "epoch": 5.991345397324941, + "grad_norm": 0.190088158411966, + "learning_rate": 1.7149049457908243e-05, + "loss": 0.3385, + "step": 1900 + }, + { + "epoch": 5.994492525570417, + "grad_norm": 0.18592216353409444, + "learning_rate": 1.713133682431129e-05, + "loss": 0.3351, + "step": 1901 + }, + { + "epoch": 5.997639653815893, + "grad_norm": 0.18827936723885594, + "learning_rate": 1.7113629275840265e-05, + "loss": 0.3375, + "step": 1902 + }, + { + "epoch": 6.003147128245476, + "grad_norm": 0.4975044241803055, + "learning_rate": 1.7095926830989985e-05, + "loss": 0.6166, + "step": 1903 + }, + { + "epoch": 6.006294256490952, + "grad_norm": 0.3610313120288688, + "learning_rate": 1.7078229508249965e-05, + "loss": 0.276, + "step": 1904 + }, + { + "epoch": 6.009441384736428, + "grad_norm": 0.38073466389678695, + "learning_rate": 1.706053732610435e-05, + "loss": 0.2739, + "step": 1905 + }, + { + "epoch": 6.012588512981904, + "grad_norm": 0.42338349975031847, + "learning_rate": 1.704285030303192e-05, + "loss": 0.2676, + "step": 1906 + }, + { + "epoch": 6.01573564122738, + "grad_norm": 0.29695193868084807, + "learning_rate": 1.702516845750608e-05, + "loss": 0.268, + "step": 1907 + }, + { + "epoch": 6.018882769472856, + "grad_norm": 0.35818009653270233, + "learning_rate": 1.700749180799482e-05, + "loss": 0.2675, + "step": 1908 + }, + { + "epoch": 6.022029897718332, + "grad_norm": 0.29739443630667606, + "learning_rate": 1.6989820372960685e-05, + "loss": 0.2606, + "step": 1909 + }, + { + "epoch": 6.025177025963808, + "grad_norm": 0.31018644023401853, + "learning_rate": 1.6972154170860807e-05, + "loss": 0.2663, + "step": 1910 + }, + { + "epoch": 6.028324154209284, + "grad_norm": 0.3229186420829361, + "learning_rate": 1.6954493220146827e-05, + "loss": 0.2616, + "step": 1911 + }, + { + "epoch": 6.03147128245476, + "grad_norm": 0.2769240182680597, + "learning_rate": 1.6936837539264903e-05, + "loss": 0.2687, + "step": 1912 + }, + { + "epoch": 6.034618410700236, + "grad_norm": 0.27372353067836536, + "learning_rate": 1.6919187146655698e-05, + "loss": 0.2662, + "step": 1913 + }, + { + "epoch": 6.037765538945712, + "grad_norm": 0.26192196377766025, + "learning_rate": 1.690154206075435e-05, + "loss": 0.2641, + "step": 1914 + }, + { + "epoch": 6.040912667191188, + "grad_norm": 0.2671939261828635, + "learning_rate": 1.6883902299990452e-05, + "loss": 0.2705, + "step": 1915 + }, + { + "epoch": 6.044059795436664, + "grad_norm": 0.2718399605137191, + "learning_rate": 1.6866267882788042e-05, + "loss": 0.2622, + "step": 1916 + }, + { + "epoch": 6.04720692368214, + "grad_norm": 0.2548650251007784, + "learning_rate": 1.684863882756556e-05, + "loss": 0.2575, + "step": 1917 + }, + { + "epoch": 6.050354051927616, + "grad_norm": 0.24781471314240483, + "learning_rate": 1.683101515273587e-05, + "loss": 0.2626, + "step": 1918 + }, + { + "epoch": 6.053501180173092, + "grad_norm": 0.24324637074207814, + "learning_rate": 1.681339687670618e-05, + "loss": 0.2624, + "step": 1919 + }, + { + "epoch": 6.056648308418568, + "grad_norm": 0.2436540725116606, + "learning_rate": 1.679578401787811e-05, + "loss": 0.2726, + "step": 1920 + }, + { + "epoch": 6.059795436664044, + "grad_norm": 0.253809992047894, + "learning_rate": 1.6778176594647574e-05, + "loss": 0.2638, + "step": 1921 + }, + { + "epoch": 6.06294256490952, + "grad_norm": 0.24633420682616938, + "learning_rate": 1.6760574625404827e-05, + "loss": 0.2502, + "step": 1922 + }, + { + "epoch": 6.066089693154996, + "grad_norm": 0.24090371222537213, + "learning_rate": 1.674297812853444e-05, + "loss": 0.2653, + "step": 1923 + }, + { + "epoch": 6.069236821400472, + "grad_norm": 0.24757439171536427, + "learning_rate": 1.6725387122415253e-05, + "loss": 0.268, + "step": 1924 + }, + { + "epoch": 6.072383949645948, + "grad_norm": 0.22694322630286495, + "learning_rate": 1.6707801625420375e-05, + "loss": 0.2624, + "step": 1925 + }, + { + "epoch": 6.075531077891424, + "grad_norm": 0.2549701791259606, + "learning_rate": 1.669022165591716e-05, + "loss": 0.2655, + "step": 1926 + }, + { + "epoch": 6.0786782061369005, + "grad_norm": 0.2245811781702429, + "learning_rate": 1.6672647232267194e-05, + "loss": 0.2696, + "step": 1927 + }, + { + "epoch": 6.081825334382376, + "grad_norm": 0.2711918706767307, + "learning_rate": 1.6655078372826253e-05, + "loss": 0.2718, + "step": 1928 + }, + { + "epoch": 6.084972462627852, + "grad_norm": 0.23302194321945463, + "learning_rate": 1.663751509594434e-05, + "loss": 0.2649, + "step": 1929 + }, + { + "epoch": 6.088119590873328, + "grad_norm": 0.23949565312026969, + "learning_rate": 1.6619957419965582e-05, + "loss": 0.2708, + "step": 1930 + }, + { + "epoch": 6.091266719118804, + "grad_norm": 0.22495551097762426, + "learning_rate": 1.6602405363228286e-05, + "loss": 0.2643, + "step": 1931 + }, + { + "epoch": 6.09441384736428, + "grad_norm": 0.2319845411537025, + "learning_rate": 1.6584858944064874e-05, + "loss": 0.2669, + "step": 1932 + }, + { + "epoch": 6.097560975609756, + "grad_norm": 0.22569793590685644, + "learning_rate": 1.6567318180801892e-05, + "loss": 0.2726, + "step": 1933 + }, + { + "epoch": 6.100708103855232, + "grad_norm": 0.23266554290354272, + "learning_rate": 1.6549783091759972e-05, + "loss": 0.2719, + "step": 1934 + }, + { + "epoch": 6.103855232100708, + "grad_norm": 0.2114326207930978, + "learning_rate": 1.6532253695253814e-05, + "loss": 0.2631, + "step": 1935 + }, + { + "epoch": 6.1070023603461845, + "grad_norm": 0.22853221650998712, + "learning_rate": 1.651473000959219e-05, + "loss": 0.2693, + "step": 1936 + }, + { + "epoch": 6.11014948859166, + "grad_norm": 0.22091900772196124, + "learning_rate": 1.649721205307788e-05, + "loss": 0.2614, + "step": 1937 + }, + { + "epoch": 6.113296616837136, + "grad_norm": 0.21192508829344028, + "learning_rate": 1.6479699844007706e-05, + "loss": 0.2662, + "step": 1938 + }, + { + "epoch": 6.116443745082612, + "grad_norm": 0.21103131436590075, + "learning_rate": 1.646219340067248e-05, + "loss": 0.2664, + "step": 1939 + }, + { + "epoch": 6.119590873328088, + "grad_norm": 0.21105523751019395, + "learning_rate": 1.644469274135698e-05, + "loss": 0.2602, + "step": 1940 + }, + { + "epoch": 6.122738001573564, + "grad_norm": 0.1988689455993957, + "learning_rate": 1.6427197884339964e-05, + "loss": 0.2692, + "step": 1941 + }, + { + "epoch": 6.12588512981904, + "grad_norm": 0.22070030023746054, + "learning_rate": 1.6409708847894097e-05, + "loss": 0.2634, + "step": 1942 + }, + { + "epoch": 6.129032258064516, + "grad_norm": 0.21906464799285424, + "learning_rate": 1.639222565028601e-05, + "loss": 0.255, + "step": 1943 + }, + { + "epoch": 6.132179386309992, + "grad_norm": 0.21158801417145398, + "learning_rate": 1.637474830977619e-05, + "loss": 0.2745, + "step": 1944 + }, + { + "epoch": 6.1353265145554685, + "grad_norm": 0.21986221418279994, + "learning_rate": 1.6357276844619043e-05, + "loss": 0.2653, + "step": 1945 + }, + { + "epoch": 6.138473642800944, + "grad_norm": 0.21102204865676896, + "learning_rate": 1.633981127306281e-05, + "loss": 0.2689, + "step": 1946 + }, + { + "epoch": 6.14162077104642, + "grad_norm": 0.21326607733179273, + "learning_rate": 1.63223516133496e-05, + "loss": 0.2716, + "step": 1947 + }, + { + "epoch": 6.144767899291896, + "grad_norm": 0.2110880192361947, + "learning_rate": 1.6304897883715324e-05, + "loss": 0.2666, + "step": 1948 + }, + { + "epoch": 6.147915027537372, + "grad_norm": 0.22051215932311774, + "learning_rate": 1.6287450102389725e-05, + "loss": 0.2618, + "step": 1949 + }, + { + "epoch": 6.151062155782848, + "grad_norm": 0.21340487458450053, + "learning_rate": 1.6270008287596305e-05, + "loss": 0.2669, + "step": 1950 + }, + { + "epoch": 6.154209284028324, + "grad_norm": 0.2146633102609745, + "learning_rate": 1.6252572457552366e-05, + "loss": 0.2644, + "step": 1951 + }, + { + "epoch": 6.1573564122738, + "grad_norm": 0.2102265150162684, + "learning_rate": 1.6235142630468928e-05, + "loss": 0.2684, + "step": 1952 + }, + { + "epoch": 6.160503540519276, + "grad_norm": 0.21811839270757488, + "learning_rate": 1.621771882455076e-05, + "loss": 0.2686, + "step": 1953 + }, + { + "epoch": 6.1636506687647525, + "grad_norm": 0.21075077208103193, + "learning_rate": 1.6200301057996337e-05, + "loss": 0.2655, + "step": 1954 + }, + { + "epoch": 6.166797797010228, + "grad_norm": 0.218688567959288, + "learning_rate": 1.6182889348997832e-05, + "loss": 0.2652, + "step": 1955 + }, + { + "epoch": 6.169944925255704, + "grad_norm": 0.22171656079314364, + "learning_rate": 1.6165483715741075e-05, + "loss": 0.2844, + "step": 1956 + }, + { + "epoch": 6.17309205350118, + "grad_norm": 0.25323058930729025, + "learning_rate": 1.6148084176405567e-05, + "loss": 0.2708, + "step": 1957 + }, + { + "epoch": 6.176239181746656, + "grad_norm": 0.21655864066814562, + "learning_rate": 1.6130690749164437e-05, + "loss": 0.2651, + "step": 1958 + }, + { + "epoch": 6.1793863099921325, + "grad_norm": 0.21929256246949802, + "learning_rate": 1.6113303452184434e-05, + "loss": 0.2761, + "step": 1959 + }, + { + "epoch": 6.182533438237608, + "grad_norm": 0.21900714816732905, + "learning_rate": 1.6095922303625902e-05, + "loss": 0.2731, + "step": 1960 + }, + { + "epoch": 6.185680566483084, + "grad_norm": 0.23059154637063942, + "learning_rate": 1.6078547321642758e-05, + "loss": 0.2702, + "step": 1961 + }, + { + "epoch": 6.18882769472856, + "grad_norm": 0.21225513261785303, + "learning_rate": 1.6061178524382483e-05, + "loss": 0.273, + "step": 1962 + }, + { + "epoch": 6.191974822974037, + "grad_norm": 0.23067148079801209, + "learning_rate": 1.6043815929986094e-05, + "loss": 0.2749, + "step": 1963 + }, + { + "epoch": 6.195121951219512, + "grad_norm": 0.20922145207589546, + "learning_rate": 1.602645955658815e-05, + "loss": 0.2603, + "step": 1964 + }, + { + "epoch": 6.198269079464988, + "grad_norm": 0.23195506433753524, + "learning_rate": 1.600910942231668e-05, + "loss": 0.2641, + "step": 1965 + }, + { + "epoch": 6.201416207710464, + "grad_norm": 0.20849955610662246, + "learning_rate": 1.599176554529321e-05, + "loss": 0.2652, + "step": 1966 + }, + { + "epoch": 6.20456333595594, + "grad_norm": 0.22725404485600895, + "learning_rate": 1.597442794363275e-05, + "loss": 0.2714, + "step": 1967 + }, + { + "epoch": 6.2077104642014165, + "grad_norm": 0.21709605058575326, + "learning_rate": 1.595709663544372e-05, + "loss": 0.2631, + "step": 1968 + }, + { + "epoch": 6.210857592446892, + "grad_norm": 0.21948985507840815, + "learning_rate": 1.5939771638827997e-05, + "loss": 0.2659, + "step": 1969 + }, + { + "epoch": 6.214004720692368, + "grad_norm": 0.22768102749279248, + "learning_rate": 1.5922452971880848e-05, + "loss": 0.267, + "step": 1970 + }, + { + "epoch": 6.217151848937844, + "grad_norm": 0.22470980313296637, + "learning_rate": 1.5905140652690935e-05, + "loss": 0.2751, + "step": 1971 + }, + { + "epoch": 6.220298977183321, + "grad_norm": 0.21601930900048635, + "learning_rate": 1.5887834699340288e-05, + "loss": 0.2687, + "step": 1972 + }, + { + "epoch": 6.223446105428796, + "grad_norm": 0.23789145044732735, + "learning_rate": 1.587053512990431e-05, + "loss": 0.2648, + "step": 1973 + }, + { + "epoch": 6.226593233674272, + "grad_norm": 0.20645827162898345, + "learning_rate": 1.5853241962451688e-05, + "loss": 0.2656, + "step": 1974 + }, + { + "epoch": 6.229740361919748, + "grad_norm": 0.23749596259128172, + "learning_rate": 1.5835955215044466e-05, + "loss": 0.2649, + "step": 1975 + }, + { + "epoch": 6.232887490165224, + "grad_norm": 0.21313415509044598, + "learning_rate": 1.581867490573797e-05, + "loss": 0.2724, + "step": 1976 + }, + { + "epoch": 6.2360346184107005, + "grad_norm": 0.23951028716861328, + "learning_rate": 1.580140105258079e-05, + "loss": 0.2706, + "step": 1977 + }, + { + "epoch": 6.239181746656176, + "grad_norm": 0.22264880703435372, + "learning_rate": 1.5784133673614787e-05, + "loss": 0.2745, + "step": 1978 + }, + { + "epoch": 6.242328874901652, + "grad_norm": 0.21806089788197308, + "learning_rate": 1.576687278687504e-05, + "loss": 0.2714, + "step": 1979 + }, + { + "epoch": 6.245476003147128, + "grad_norm": 0.21519597916517344, + "learning_rate": 1.5749618410389884e-05, + "loss": 0.2749, + "step": 1980 + }, + { + "epoch": 6.248623131392605, + "grad_norm": 0.23270010761799506, + "learning_rate": 1.5732370562180826e-05, + "loss": 0.2656, + "step": 1981 + }, + { + "epoch": 6.25177025963808, + "grad_norm": 0.20586789811939893, + "learning_rate": 1.5715129260262556e-05, + "loss": 0.2695, + "step": 1982 + }, + { + "epoch": 6.254917387883556, + "grad_norm": 0.2437679513570022, + "learning_rate": 1.5697894522642928e-05, + "loss": 0.2748, + "step": 1983 + }, + { + "epoch": 6.258064516129032, + "grad_norm": 0.2281678596319344, + "learning_rate": 1.568066636732295e-05, + "loss": 0.2608, + "step": 1984 + }, + { + "epoch": 6.261211644374509, + "grad_norm": 0.2330467459753673, + "learning_rate": 1.566344481229674e-05, + "loss": 0.2582, + "step": 1985 + }, + { + "epoch": 6.2643587726199845, + "grad_norm": 0.21485682537512538, + "learning_rate": 1.564622987555154e-05, + "loss": 0.2753, + "step": 1986 + }, + { + "epoch": 6.26750590086546, + "grad_norm": 0.22237525288016594, + "learning_rate": 1.5629021575067662e-05, + "loss": 0.2681, + "step": 1987 + }, + { + "epoch": 6.270653029110936, + "grad_norm": 0.22728442867475201, + "learning_rate": 1.5611819928818502e-05, + "loss": 0.2714, + "step": 1988 + }, + { + "epoch": 6.273800157356412, + "grad_norm": 0.22800474162990042, + "learning_rate": 1.5594624954770494e-05, + "loss": 0.2708, + "step": 1989 + }, + { + "epoch": 6.276947285601889, + "grad_norm": 0.22113656266743126, + "learning_rate": 1.5577436670883108e-05, + "loss": 0.2673, + "step": 1990 + }, + { + "epoch": 6.280094413847364, + "grad_norm": 0.21824707276540103, + "learning_rate": 1.5560255095108824e-05, + "loss": 0.2755, + "step": 1991 + }, + { + "epoch": 6.28324154209284, + "grad_norm": 0.22677192743914226, + "learning_rate": 1.5543080245393128e-05, + "loss": 0.2695, + "step": 1992 + }, + { + "epoch": 6.286388670338316, + "grad_norm": 0.22384148599348552, + "learning_rate": 1.552591213967446e-05, + "loss": 0.2693, + "step": 1993 + }, + { + "epoch": 6.289535798583792, + "grad_norm": 0.2231213061535353, + "learning_rate": 1.5508750795884222e-05, + "loss": 0.2743, + "step": 1994 + }, + { + "epoch": 6.2926829268292686, + "grad_norm": 0.21524551643992698, + "learning_rate": 1.5491596231946764e-05, + "loss": 0.2615, + "step": 1995 + }, + { + "epoch": 6.295830055074744, + "grad_norm": 0.21550805228949638, + "learning_rate": 1.5474448465779355e-05, + "loss": 0.2721, + "step": 1996 + }, + { + "epoch": 6.29897718332022, + "grad_norm": 0.2157668033969489, + "learning_rate": 1.5457307515292152e-05, + "loss": 0.268, + "step": 1997 + }, + { + "epoch": 6.302124311565696, + "grad_norm": 0.21835492722502536, + "learning_rate": 1.5440173398388202e-05, + "loss": 0.2667, + "step": 1998 + }, + { + "epoch": 6.305271439811173, + "grad_norm": 0.2116868929153955, + "learning_rate": 1.5423046132963407e-05, + "loss": 0.2646, + "step": 1999 + }, + { + "epoch": 6.3084185680566485, + "grad_norm": 0.23996177968549967, + "learning_rate": 1.5405925736906507e-05, + "loss": 0.2681, + "step": 2000 + }, + { + "epoch": 6.311565696302124, + "grad_norm": 0.227126159207544, + "learning_rate": 1.5388812228099105e-05, + "loss": 0.268, + "step": 2001 + }, + { + "epoch": 6.3147128245476, + "grad_norm": 0.22790212437206483, + "learning_rate": 1.5371705624415566e-05, + "loss": 0.2693, + "step": 2002 + }, + { + "epoch": 6.317859952793077, + "grad_norm": 0.21628336152215077, + "learning_rate": 1.535460594372307e-05, + "loss": 0.2712, + "step": 2003 + }, + { + "epoch": 6.321007081038553, + "grad_norm": 0.2271127802784349, + "learning_rate": 1.533751320388154e-05, + "loss": 0.2687, + "step": 2004 + }, + { + "epoch": 6.324154209284028, + "grad_norm": 0.21421542101219174, + "learning_rate": 1.5320427422743685e-05, + "loss": 0.2718, + "step": 2005 + }, + { + "epoch": 6.327301337529504, + "grad_norm": 0.24451271038131359, + "learning_rate": 1.5303348618154915e-05, + "loss": 0.2623, + "step": 2006 + }, + { + "epoch": 6.33044846577498, + "grad_norm": 0.20321144790178836, + "learning_rate": 1.5286276807953365e-05, + "loss": 0.2693, + "step": 2007 + }, + { + "epoch": 6.333595594020457, + "grad_norm": 0.24302047081282138, + "learning_rate": 1.5269212009969868e-05, + "loss": 0.2725, + "step": 2008 + }, + { + "epoch": 6.3367427222659325, + "grad_norm": 0.21613725540205522, + "learning_rate": 1.5252154242027932e-05, + "loss": 0.2695, + "step": 2009 + }, + { + "epoch": 6.339889850511408, + "grad_norm": 0.20743059537745503, + "learning_rate": 1.5235103521943719e-05, + "loss": 0.2729, + "step": 2010 + }, + { + "epoch": 6.343036978756884, + "grad_norm": 0.2188470883353957, + "learning_rate": 1.5218059867526025e-05, + "loss": 0.2633, + "step": 2011 + }, + { + "epoch": 6.34618410700236, + "grad_norm": 0.2077384192612987, + "learning_rate": 1.5201023296576281e-05, + "loss": 0.2749, + "step": 2012 + }, + { + "epoch": 6.349331235247837, + "grad_norm": 0.21085206599076245, + "learning_rate": 1.5183993826888506e-05, + "loss": 0.28, + "step": 2013 + }, + { + "epoch": 6.352478363493312, + "grad_norm": 0.20963113069856626, + "learning_rate": 1.5166971476249299e-05, + "loss": 0.2699, + "step": 2014 + }, + { + "epoch": 6.355625491738788, + "grad_norm": 0.21152418642222406, + "learning_rate": 1.5149956262437848e-05, + "loss": 0.2691, + "step": 2015 + }, + { + "epoch": 6.358772619984264, + "grad_norm": 0.21798400408864652, + "learning_rate": 1.5132948203225866e-05, + "loss": 0.2701, + "step": 2016 + }, + { + "epoch": 6.361919748229741, + "grad_norm": 0.212384884342032, + "learning_rate": 1.5115947316377591e-05, + "loss": 0.2714, + "step": 2017 + }, + { + "epoch": 6.3650668764752165, + "grad_norm": 0.21764729442192574, + "learning_rate": 1.5098953619649779e-05, + "loss": 0.2706, + "step": 2018 + }, + { + "epoch": 6.368214004720692, + "grad_norm": 0.20976149383277362, + "learning_rate": 1.5081967130791672e-05, + "loss": 0.2715, + "step": 2019 + }, + { + "epoch": 6.371361132966168, + "grad_norm": 0.21234247868680792, + "learning_rate": 1.5064987867544982e-05, + "loss": 0.2665, + "step": 2020 + }, + { + "epoch": 6.374508261211645, + "grad_norm": 0.21051751829878967, + "learning_rate": 1.5048015847643887e-05, + "loss": 0.2672, + "step": 2021 + }, + { + "epoch": 6.377655389457121, + "grad_norm": 0.20516874410918237, + "learning_rate": 1.5031051088814982e-05, + "loss": 0.2634, + "step": 2022 + }, + { + "epoch": 6.380802517702596, + "grad_norm": 0.1999457276853811, + "learning_rate": 1.5014093608777294e-05, + "loss": 0.2738, + "step": 2023 + }, + { + "epoch": 6.383949645948072, + "grad_norm": 0.21305496050001185, + "learning_rate": 1.4997143425242229e-05, + "loss": 0.2737, + "step": 2024 + }, + { + "epoch": 6.387096774193548, + "grad_norm": 0.20672780819338069, + "learning_rate": 1.4980200555913586e-05, + "loss": 0.2718, + "step": 2025 + }, + { + "epoch": 6.390243902439025, + "grad_norm": 0.21450892009300504, + "learning_rate": 1.4963265018487523e-05, + "loss": 0.2654, + "step": 2026 + }, + { + "epoch": 6.3933910306845005, + "grad_norm": 0.20320195505647742, + "learning_rate": 1.4946336830652533e-05, + "loss": 0.2658, + "step": 2027 + }, + { + "epoch": 6.396538158929976, + "grad_norm": 0.20392713465924406, + "learning_rate": 1.492941601008945e-05, + "loss": 0.2746, + "step": 2028 + }, + { + "epoch": 6.399685287175452, + "grad_norm": 0.20959876954254805, + "learning_rate": 1.4912502574471384e-05, + "loss": 0.2747, + "step": 2029 + }, + { + "epoch": 6.402832415420928, + "grad_norm": 0.2108786736914976, + "learning_rate": 1.4895596541463771e-05, + "loss": 0.2701, + "step": 2030 + }, + { + "epoch": 6.405979543666405, + "grad_norm": 0.22118213608216855, + "learning_rate": 1.4878697928724273e-05, + "loss": 0.272, + "step": 2031 + }, + { + "epoch": 6.4091266719118805, + "grad_norm": 0.203568518365141, + "learning_rate": 1.486180675390283e-05, + "loss": 0.2659, + "step": 2032 + }, + { + "epoch": 6.412273800157356, + "grad_norm": 0.2227618023805204, + "learning_rate": 1.484492303464161e-05, + "loss": 0.2717, + "step": 2033 + }, + { + "epoch": 6.415420928402832, + "grad_norm": 0.21398359499500041, + "learning_rate": 1.482804678857498e-05, + "loss": 0.2648, + "step": 2034 + }, + { + "epoch": 6.418568056648309, + "grad_norm": 0.22308492477910258, + "learning_rate": 1.4811178033329516e-05, + "loss": 0.2642, + "step": 2035 + }, + { + "epoch": 6.421715184893785, + "grad_norm": 0.21249595991484213, + "learning_rate": 1.4794316786523962e-05, + "loss": 0.2683, + "step": 2036 + }, + { + "epoch": 6.42486231313926, + "grad_norm": 0.21068800662577022, + "learning_rate": 1.4777463065769224e-05, + "loss": 0.2701, + "step": 2037 + }, + { + "epoch": 6.428009441384736, + "grad_norm": 0.21437199711857782, + "learning_rate": 1.4760616888668353e-05, + "loss": 0.2747, + "step": 2038 + }, + { + "epoch": 6.431156569630213, + "grad_norm": 0.22330541645842986, + "learning_rate": 1.4743778272816504e-05, + "loss": 0.2704, + "step": 2039 + }, + { + "epoch": 6.434303697875689, + "grad_norm": 0.2062736384947792, + "learning_rate": 1.4726947235800952e-05, + "loss": 0.272, + "step": 2040 + }, + { + "epoch": 6.4374508261211645, + "grad_norm": 0.22521586255432074, + "learning_rate": 1.4710123795201039e-05, + "loss": 0.278, + "step": 2041 + }, + { + "epoch": 6.44059795436664, + "grad_norm": 0.203912107820357, + "learning_rate": 1.4693307968588194e-05, + "loss": 0.2711, + "step": 2042 + }, + { + "epoch": 6.443745082612116, + "grad_norm": 0.21931119328761017, + "learning_rate": 1.4676499773525863e-05, + "loss": 0.2663, + "step": 2043 + }, + { + "epoch": 6.446892210857593, + "grad_norm": 0.2094814854379808, + "learning_rate": 1.4659699227569566e-05, + "loss": 0.2695, + "step": 2044 + }, + { + "epoch": 6.450039339103069, + "grad_norm": 0.23117534470233453, + "learning_rate": 1.464290634826679e-05, + "loss": 0.2692, + "step": 2045 + }, + { + "epoch": 6.453186467348544, + "grad_norm": 0.21379505139811097, + "learning_rate": 1.4626121153157046e-05, + "loss": 0.2768, + "step": 2046 + }, + { + "epoch": 6.45633359559402, + "grad_norm": 0.217663110855076, + "learning_rate": 1.4609343659771793e-05, + "loss": 0.2713, + "step": 2047 + }, + { + "epoch": 6.459480723839496, + "grad_norm": 0.2040638494691249, + "learning_rate": 1.4592573885634464e-05, + "loss": 0.2654, + "step": 2048 + }, + { + "epoch": 6.462627852084973, + "grad_norm": 0.2303747870401908, + "learning_rate": 1.4575811848260429e-05, + "loss": 0.2749, + "step": 2049 + }, + { + "epoch": 6.4657749803304485, + "grad_norm": 0.20958637817569242, + "learning_rate": 1.4559057565156964e-05, + "loss": 0.2708, + "step": 2050 + }, + { + "epoch": 6.468922108575924, + "grad_norm": 0.2224204366720408, + "learning_rate": 1.4542311053823257e-05, + "loss": 0.2748, + "step": 2051 + }, + { + "epoch": 6.4720692368214, + "grad_norm": 0.21278520060079634, + "learning_rate": 1.4525572331750373e-05, + "loss": 0.2674, + "step": 2052 + }, + { + "epoch": 6.475216365066877, + "grad_norm": 0.21995573974288046, + "learning_rate": 1.4508841416421256e-05, + "loss": 0.2696, + "step": 2053 + }, + { + "epoch": 6.478363493312353, + "grad_norm": 0.222067909671715, + "learning_rate": 1.4492118325310673e-05, + "loss": 0.2735, + "step": 2054 + }, + { + "epoch": 6.481510621557828, + "grad_norm": 0.21469400060674615, + "learning_rate": 1.4475403075885233e-05, + "loss": 0.2738, + "step": 2055 + }, + { + "epoch": 6.484657749803304, + "grad_norm": 0.2220259001976218, + "learning_rate": 1.445869568560335e-05, + "loss": 0.2655, + "step": 2056 + }, + { + "epoch": 6.487804878048781, + "grad_norm": 0.22065015324342255, + "learning_rate": 1.4441996171915241e-05, + "loss": 0.2703, + "step": 2057 + }, + { + "epoch": 6.490952006294257, + "grad_norm": 0.2233908978556124, + "learning_rate": 1.4425304552262876e-05, + "loss": 0.2749, + "step": 2058 + }, + { + "epoch": 6.4940991345397325, + "grad_norm": 0.21860192593438782, + "learning_rate": 1.4408620844079998e-05, + "loss": 0.2691, + "step": 2059 + }, + { + "epoch": 6.497246262785208, + "grad_norm": 0.21630356299188297, + "learning_rate": 1.4391945064792076e-05, + "loss": 0.2699, + "step": 2060 + }, + { + "epoch": 6.500393391030684, + "grad_norm": 0.20468468546139096, + "learning_rate": 1.4375277231816309e-05, + "loss": 0.2659, + "step": 2061 + }, + { + "epoch": 6.503540519276161, + "grad_norm": 0.2266198893281483, + "learning_rate": 1.435861736256158e-05, + "loss": 0.2636, + "step": 2062 + }, + { + "epoch": 6.506687647521637, + "grad_norm": 0.210355804203251, + "learning_rate": 1.4341965474428463e-05, + "loss": 0.281, + "step": 2063 + }, + { + "epoch": 6.5098347757671124, + "grad_norm": 0.2104007529783895, + "learning_rate": 1.4325321584809193e-05, + "loss": 0.2745, + "step": 2064 + }, + { + "epoch": 6.512981904012588, + "grad_norm": 0.21454334831641367, + "learning_rate": 1.4308685711087664e-05, + "loss": 0.2714, + "step": 2065 + }, + { + "epoch": 6.516129032258064, + "grad_norm": 0.20461473363605256, + "learning_rate": 1.4292057870639387e-05, + "loss": 0.2737, + "step": 2066 + }, + { + "epoch": 6.519276160503541, + "grad_norm": 0.22229813634194787, + "learning_rate": 1.4275438080831468e-05, + "loss": 0.2713, + "step": 2067 + }, + { + "epoch": 6.522423288749017, + "grad_norm": 0.20801329570201357, + "learning_rate": 1.4258826359022639e-05, + "loss": 0.2664, + "step": 2068 + }, + { + "epoch": 6.525570416994492, + "grad_norm": 0.2141355735304912, + "learning_rate": 1.4242222722563166e-05, + "loss": 0.2692, + "step": 2069 + }, + { + "epoch": 6.528717545239968, + "grad_norm": 0.21674575849736738, + "learning_rate": 1.4225627188794913e-05, + "loss": 0.2735, + "step": 2070 + }, + { + "epoch": 6.531864673485445, + "grad_norm": 0.23378921154557367, + "learning_rate": 1.4209039775051233e-05, + "loss": 0.2779, + "step": 2071 + }, + { + "epoch": 6.535011801730921, + "grad_norm": 0.20405908727514255, + "learning_rate": 1.4192460498657035e-05, + "loss": 0.2778, + "step": 2072 + }, + { + "epoch": 6.5381589299763965, + "grad_norm": 0.2167015673737203, + "learning_rate": 1.4175889376928717e-05, + "loss": 0.2674, + "step": 2073 + }, + { + "epoch": 6.541306058221872, + "grad_norm": 0.22602621445184934, + "learning_rate": 1.415932642717416e-05, + "loss": 0.2776, + "step": 2074 + }, + { + "epoch": 6.544453186467349, + "grad_norm": 0.2103436914815303, + "learning_rate": 1.4142771666692716e-05, + "loss": 0.2748, + "step": 2075 + }, + { + "epoch": 6.547600314712825, + "grad_norm": 0.22284944224802702, + "learning_rate": 1.4126225112775163e-05, + "loss": 0.2703, + "step": 2076 + }, + { + "epoch": 6.550747442958301, + "grad_norm": 0.20859337785195634, + "learning_rate": 1.4109686782703729e-05, + "loss": 0.2751, + "step": 2077 + }, + { + "epoch": 6.553894571203776, + "grad_norm": 0.20898420028915374, + "learning_rate": 1.4093156693752041e-05, + "loss": 0.2722, + "step": 2078 + }, + { + "epoch": 6.557041699449252, + "grad_norm": 0.21921481305234924, + "learning_rate": 1.407663486318513e-05, + "loss": 0.2743, + "step": 2079 + }, + { + "epoch": 6.560188827694729, + "grad_norm": 0.19770871033798984, + "learning_rate": 1.4060121308259386e-05, + "loss": 0.2682, + "step": 2080 + }, + { + "epoch": 6.563335955940205, + "grad_norm": 0.20251135019019187, + "learning_rate": 1.4043616046222562e-05, + "loss": 0.2796, + "step": 2081 + }, + { + "epoch": 6.5664830841856805, + "grad_norm": 0.20589764192052976, + "learning_rate": 1.4027119094313766e-05, + "loss": 0.268, + "step": 2082 + }, + { + "epoch": 6.569630212431156, + "grad_norm": 0.20962471096621335, + "learning_rate": 1.4010630469763386e-05, + "loss": 0.2689, + "step": 2083 + }, + { + "epoch": 6.572777340676632, + "grad_norm": 0.2077048133726809, + "learning_rate": 1.3994150189793165e-05, + "loss": 0.2666, + "step": 2084 + }, + { + "epoch": 6.575924468922109, + "grad_norm": 0.21367970260930136, + "learning_rate": 1.397767827161608e-05, + "loss": 0.2668, + "step": 2085 + }, + { + "epoch": 6.579071597167585, + "grad_norm": 0.20369044452822496, + "learning_rate": 1.3961214732436407e-05, + "loss": 0.2717, + "step": 2086 + }, + { + "epoch": 6.58221872541306, + "grad_norm": 0.20608659783474267, + "learning_rate": 1.3944759589449657e-05, + "loss": 0.2662, + "step": 2087 + }, + { + "epoch": 6.585365853658536, + "grad_norm": 0.21167921476702603, + "learning_rate": 1.3928312859842592e-05, + "loss": 0.2548, + "step": 2088 + }, + { + "epoch": 6.588512981904013, + "grad_norm": 0.2119572678076718, + "learning_rate": 1.3911874560793149e-05, + "loss": 0.2686, + "step": 2089 + }, + { + "epoch": 6.591660110149489, + "grad_norm": 0.23046584602252454, + "learning_rate": 1.3895444709470485e-05, + "loss": 0.2691, + "step": 2090 + }, + { + "epoch": 6.5948072383949645, + "grad_norm": 0.2152670153636726, + "learning_rate": 1.387902332303494e-05, + "loss": 0.2789, + "step": 2091 + }, + { + "epoch": 6.59795436664044, + "grad_norm": 0.21392485708596343, + "learning_rate": 1.3862610418637988e-05, + "loss": 0.276, + "step": 2092 + }, + { + "epoch": 6.601101494885917, + "grad_norm": 0.2251993808341726, + "learning_rate": 1.384620601342227e-05, + "loss": 0.2731, + "step": 2093 + }, + { + "epoch": 6.604248623131393, + "grad_norm": 0.2105054752455257, + "learning_rate": 1.3829810124521528e-05, + "loss": 0.2712, + "step": 2094 + }, + { + "epoch": 6.607395751376869, + "grad_norm": 0.22197663382801497, + "learning_rate": 1.3813422769060628e-05, + "loss": 0.269, + "step": 2095 + }, + { + "epoch": 6.610542879622344, + "grad_norm": 0.21251503803497962, + "learning_rate": 1.37970439641555e-05, + "loss": 0.2673, + "step": 2096 + }, + { + "epoch": 6.61369000786782, + "grad_norm": 0.22088548865063007, + "learning_rate": 1.3780673726913168e-05, + "loss": 0.2741, + "step": 2097 + }, + { + "epoch": 6.616837136113297, + "grad_norm": 0.2100559518908244, + "learning_rate": 1.37643120744317e-05, + "loss": 0.2809, + "step": 2098 + }, + { + "epoch": 6.619984264358773, + "grad_norm": 0.21846715779185052, + "learning_rate": 1.3747959023800181e-05, + "loss": 0.2712, + "step": 2099 + }, + { + "epoch": 6.6231313926042485, + "grad_norm": 0.2051793253015262, + "learning_rate": 1.3731614592098735e-05, + "loss": 0.274, + "step": 2100 + }, + { + "epoch": 6.626278520849724, + "grad_norm": 0.21998385302968826, + "learning_rate": 1.3715278796398468e-05, + "loss": 0.2668, + "step": 2101 + }, + { + "epoch": 6.6294256490952, + "grad_norm": 0.20524755125389635, + "learning_rate": 1.3698951653761487e-05, + "loss": 0.2726, + "step": 2102 + }, + { + "epoch": 6.632572777340677, + "grad_norm": 0.21310125591169676, + "learning_rate": 1.3682633181240826e-05, + "loss": 0.2731, + "step": 2103 + }, + { + "epoch": 6.635719905586153, + "grad_norm": 0.20373328332909574, + "learning_rate": 1.3666323395880493e-05, + "loss": 0.2786, + "step": 2104 + }, + { + "epoch": 6.6388670338316285, + "grad_norm": 0.2194259189312518, + "learning_rate": 1.3650022314715412e-05, + "loss": 0.271, + "step": 2105 + }, + { + "epoch": 6.642014162077104, + "grad_norm": 0.2044629306655923, + "learning_rate": 1.3633729954771414e-05, + "loss": 0.2768, + "step": 2106 + }, + { + "epoch": 6.645161290322581, + "grad_norm": 0.2071870556335287, + "learning_rate": 1.3617446333065234e-05, + "loss": 0.2742, + "step": 2107 + }, + { + "epoch": 6.648308418568057, + "grad_norm": 0.20950878944876208, + "learning_rate": 1.3601171466604452e-05, + "loss": 0.274, + "step": 2108 + }, + { + "epoch": 6.651455546813533, + "grad_norm": 0.2048060795347047, + "learning_rate": 1.3584905372387542e-05, + "loss": 0.2744, + "step": 2109 + }, + { + "epoch": 6.654602675059008, + "grad_norm": 0.20307264689129167, + "learning_rate": 1.356864806740378e-05, + "loss": 0.2718, + "step": 2110 + }, + { + "epoch": 6.657749803304485, + "grad_norm": 0.20870683975446555, + "learning_rate": 1.3552399568633287e-05, + "loss": 0.2751, + "step": 2111 + }, + { + "epoch": 6.660896931549961, + "grad_norm": 0.20074881013703952, + "learning_rate": 1.3536159893046969e-05, + "loss": 0.2724, + "step": 2112 + }, + { + "epoch": 6.664044059795437, + "grad_norm": 0.20014382071858103, + "learning_rate": 1.3519929057606526e-05, + "loss": 0.2693, + "step": 2113 + }, + { + "epoch": 6.6671911880409125, + "grad_norm": 0.20804817092817526, + "learning_rate": 1.3503707079264432e-05, + "loss": 0.274, + "step": 2114 + }, + { + "epoch": 6.670338316286388, + "grad_norm": 0.20493619242075548, + "learning_rate": 1.348749397496388e-05, + "loss": 0.2769, + "step": 2115 + }, + { + "epoch": 6.673485444531865, + "grad_norm": 0.2099027610431233, + "learning_rate": 1.3471289761638842e-05, + "loss": 0.2764, + "step": 2116 + }, + { + "epoch": 6.676632572777341, + "grad_norm": 0.193445232337728, + "learning_rate": 1.345509445621396e-05, + "loss": 0.2695, + "step": 2117 + }, + { + "epoch": 6.679779701022817, + "grad_norm": 0.2096470666573266, + "learning_rate": 1.34389080756046e-05, + "loss": 0.2829, + "step": 2118 + }, + { + "epoch": 6.682926829268292, + "grad_norm": 0.20576106019634247, + "learning_rate": 1.342273063671678e-05, + "loss": 0.2756, + "step": 2119 + }, + { + "epoch": 6.686073957513768, + "grad_norm": 0.21501849543327453, + "learning_rate": 1.3406562156447211e-05, + "loss": 0.2727, + "step": 2120 + }, + { + "epoch": 6.689221085759245, + "grad_norm": 0.1993115840566364, + "learning_rate": 1.339040265168322e-05, + "loss": 0.2663, + "step": 2121 + }, + { + "epoch": 6.692368214004721, + "grad_norm": 0.21226796949929252, + "learning_rate": 1.337425213930277e-05, + "loss": 0.2708, + "step": 2122 + }, + { + "epoch": 6.6955153422501965, + "grad_norm": 0.20469216116105463, + "learning_rate": 1.3358110636174443e-05, + "loss": 0.277, + "step": 2123 + }, + { + "epoch": 6.698662470495672, + "grad_norm": 0.21150553812629214, + "learning_rate": 1.3341978159157388e-05, + "loss": 0.2726, + "step": 2124 + }, + { + "epoch": 6.701809598741149, + "grad_norm": 0.2009979298846876, + "learning_rate": 1.3325854725101346e-05, + "loss": 0.2742, + "step": 2125 + }, + { + "epoch": 6.704956726986625, + "grad_norm": 0.20585970570515025, + "learning_rate": 1.3309740350846597e-05, + "loss": 0.2712, + "step": 2126 + }, + { + "epoch": 6.708103855232101, + "grad_norm": 0.20267136318362497, + "learning_rate": 1.3293635053223976e-05, + "loss": 0.2768, + "step": 2127 + }, + { + "epoch": 6.711250983477576, + "grad_norm": 0.1966034773897379, + "learning_rate": 1.3277538849054818e-05, + "loss": 0.2685, + "step": 2128 + }, + { + "epoch": 6.714398111723053, + "grad_norm": 0.19084672362126345, + "learning_rate": 1.326145175515098e-05, + "loss": 0.2707, + "step": 2129 + }, + { + "epoch": 6.717545239968529, + "grad_norm": 0.21151915236025404, + "learning_rate": 1.324537378831479e-05, + "loss": 0.2762, + "step": 2130 + }, + { + "epoch": 6.720692368214005, + "grad_norm": 0.19161947045159958, + "learning_rate": 1.3229304965339052e-05, + "loss": 0.272, + "step": 2131 + }, + { + "epoch": 6.7238394964594805, + "grad_norm": 0.2057037763555039, + "learning_rate": 1.3213245303007018e-05, + "loss": 0.2731, + "step": 2132 + }, + { + "epoch": 6.726986624704956, + "grad_norm": 0.20303632318163164, + "learning_rate": 1.3197194818092359e-05, + "loss": 0.2773, + "step": 2133 + }, + { + "epoch": 6.730133752950433, + "grad_norm": 0.19044415445481397, + "learning_rate": 1.318115352735918e-05, + "loss": 0.2793, + "step": 2134 + }, + { + "epoch": 6.733280881195909, + "grad_norm": 0.2073975268632104, + "learning_rate": 1.3165121447561968e-05, + "loss": 0.2683, + "step": 2135 + }, + { + "epoch": 6.736428009441385, + "grad_norm": 0.2133081665575485, + "learning_rate": 1.3149098595445604e-05, + "loss": 0.2742, + "step": 2136 + }, + { + "epoch": 6.7395751376868605, + "grad_norm": 0.21038693316237772, + "learning_rate": 1.313308498774531e-05, + "loss": 0.2712, + "step": 2137 + }, + { + "epoch": 6.742722265932336, + "grad_norm": 0.21412237772602838, + "learning_rate": 1.3117080641186672e-05, + "loss": 0.2765, + "step": 2138 + }, + { + "epoch": 6.745869394177813, + "grad_norm": 0.20678305330276317, + "learning_rate": 1.3101085572485603e-05, + "loss": 0.2688, + "step": 2139 + }, + { + "epoch": 6.749016522423289, + "grad_norm": 0.21245293744715033, + "learning_rate": 1.3085099798348306e-05, + "loss": 0.2718, + "step": 2140 + }, + { + "epoch": 6.752163650668765, + "grad_norm": 0.20398550728736917, + "learning_rate": 1.3069123335471301e-05, + "loss": 0.2714, + "step": 2141 + }, + { + "epoch": 6.755310778914241, + "grad_norm": 0.2031385438667128, + "learning_rate": 1.3053156200541364e-05, + "loss": 0.2699, + "step": 2142 + }, + { + "epoch": 6.758457907159717, + "grad_norm": 0.19773088989378942, + "learning_rate": 1.303719841023553e-05, + "loss": 0.2635, + "step": 2143 + }, + { + "epoch": 6.761605035405193, + "grad_norm": 0.21395849634573397, + "learning_rate": 1.3021249981221086e-05, + "loss": 0.2771, + "step": 2144 + }, + { + "epoch": 6.764752163650669, + "grad_norm": 0.19961802457486216, + "learning_rate": 1.3005310930155544e-05, + "loss": 0.2709, + "step": 2145 + }, + { + "epoch": 6.7678992918961445, + "grad_norm": 0.21134776492595922, + "learning_rate": 1.2989381273686597e-05, + "loss": 0.2669, + "step": 2146 + }, + { + "epoch": 6.771046420141621, + "grad_norm": 0.19864060482042745, + "learning_rate": 1.2973461028452144e-05, + "loss": 0.2706, + "step": 2147 + }, + { + "epoch": 6.774193548387097, + "grad_norm": 0.2044619678907636, + "learning_rate": 1.2957550211080259e-05, + "loss": 0.2739, + "step": 2148 + }, + { + "epoch": 6.777340676632573, + "grad_norm": 0.21504368793018358, + "learning_rate": 1.2941648838189147e-05, + "loss": 0.2674, + "step": 2149 + }, + { + "epoch": 6.780487804878049, + "grad_norm": 0.20378150368432318, + "learning_rate": 1.2925756926387177e-05, + "loss": 0.2696, + "step": 2150 + }, + { + "epoch": 6.783634933123524, + "grad_norm": 0.20600148728967427, + "learning_rate": 1.2909874492272807e-05, + "loss": 0.2802, + "step": 2151 + }, + { + "epoch": 6.786782061369001, + "grad_norm": 0.19938138549196283, + "learning_rate": 1.2894001552434626e-05, + "loss": 0.2759, + "step": 2152 + }, + { + "epoch": 6.789929189614477, + "grad_norm": 0.21154201382497265, + "learning_rate": 1.2878138123451274e-05, + "loss": 0.2731, + "step": 2153 + }, + { + "epoch": 6.793076317859953, + "grad_norm": 0.19784577469812065, + "learning_rate": 1.2862284221891485e-05, + "loss": 0.2763, + "step": 2154 + }, + { + "epoch": 6.7962234461054285, + "grad_norm": 0.21310912362182374, + "learning_rate": 1.2846439864314037e-05, + "loss": 0.2761, + "step": 2155 + }, + { + "epoch": 6.799370574350904, + "grad_norm": 0.20695979034558215, + "learning_rate": 1.283060506726772e-05, + "loss": 0.2774, + "step": 2156 + }, + { + "epoch": 6.802517702596381, + "grad_norm": 0.21261086091250228, + "learning_rate": 1.2814779847291367e-05, + "loss": 0.2758, + "step": 2157 + }, + { + "epoch": 6.805664830841857, + "grad_norm": 0.20979015880101165, + "learning_rate": 1.2798964220913772e-05, + "loss": 0.2804, + "step": 2158 + }, + { + "epoch": 6.808811959087333, + "grad_norm": 0.21761044939140664, + "learning_rate": 1.278315820465376e-05, + "loss": 0.2769, + "step": 2159 + }, + { + "epoch": 6.811959087332809, + "grad_norm": 0.2014697385726117, + "learning_rate": 1.2767361815020065e-05, + "loss": 0.2783, + "step": 2160 + }, + { + "epoch": 6.815106215578285, + "grad_norm": 0.21283798980232946, + "learning_rate": 1.2751575068511408e-05, + "loss": 0.2657, + "step": 2161 + }, + { + "epoch": 6.818253343823761, + "grad_norm": 0.19962011480967196, + "learning_rate": 1.2735797981616407e-05, + "loss": 0.2806, + "step": 2162 + }, + { + "epoch": 6.821400472069237, + "grad_norm": 0.21924693121937547, + "learning_rate": 1.2720030570813608e-05, + "loss": 0.2746, + "step": 2163 + }, + { + "epoch": 6.8245476003147125, + "grad_norm": 0.19657838694235807, + "learning_rate": 1.2704272852571455e-05, + "loss": 0.2684, + "step": 2164 + }, + { + "epoch": 6.827694728560189, + "grad_norm": 0.2030249463511617, + "learning_rate": 1.2688524843348252e-05, + "loss": 0.2722, + "step": 2165 + }, + { + "epoch": 6.830841856805665, + "grad_norm": 0.20062136834665203, + "learning_rate": 1.2672786559592178e-05, + "loss": 0.2722, + "step": 2166 + }, + { + "epoch": 6.833988985051141, + "grad_norm": 0.21259980872470255, + "learning_rate": 1.2657058017741237e-05, + "loss": 0.2746, + "step": 2167 + }, + { + "epoch": 6.837136113296617, + "grad_norm": 0.18743513115346688, + "learning_rate": 1.2641339234223282e-05, + "loss": 0.2695, + "step": 2168 + }, + { + "epoch": 6.840283241542092, + "grad_norm": 0.20138411396722927, + "learning_rate": 1.2625630225455946e-05, + "loss": 0.2764, + "step": 2169 + }, + { + "epoch": 6.843430369787569, + "grad_norm": 0.20724943586989145, + "learning_rate": 1.2609931007846672e-05, + "loss": 0.28, + "step": 2170 + }, + { + "epoch": 6.846577498033045, + "grad_norm": 0.20364398935190642, + "learning_rate": 1.2594241597792678e-05, + "loss": 0.2742, + "step": 2171 + }, + { + "epoch": 6.849724626278521, + "grad_norm": 0.21697947841842968, + "learning_rate": 1.2578562011680914e-05, + "loss": 0.2722, + "step": 2172 + }, + { + "epoch": 6.8528717545239966, + "grad_norm": 0.20438816521590877, + "learning_rate": 1.2562892265888116e-05, + "loss": 0.2742, + "step": 2173 + }, + { + "epoch": 6.856018882769473, + "grad_norm": 0.2160342522382541, + "learning_rate": 1.2547232376780687e-05, + "loss": 0.2757, + "step": 2174 + }, + { + "epoch": 6.859166011014949, + "grad_norm": 0.20593020428643655, + "learning_rate": 1.2531582360714775e-05, + "loss": 0.2675, + "step": 2175 + }, + { + "epoch": 6.862313139260425, + "grad_norm": 0.21437695083001138, + "learning_rate": 1.251594223403619e-05, + "loss": 0.2693, + "step": 2176 + }, + { + "epoch": 6.865460267505901, + "grad_norm": 0.2119697416305465, + "learning_rate": 1.2500312013080444e-05, + "loss": 0.2669, + "step": 2177 + }, + { + "epoch": 6.868607395751377, + "grad_norm": 0.2060511460509206, + "learning_rate": 1.2484691714172663e-05, + "loss": 0.2861, + "step": 2178 + }, + { + "epoch": 6.871754523996853, + "grad_norm": 0.19849969915264076, + "learning_rate": 1.246908135362764e-05, + "loss": 0.2758, + "step": 2179 + }, + { + "epoch": 6.874901652242329, + "grad_norm": 0.20718364482758267, + "learning_rate": 1.2453480947749785e-05, + "loss": 0.2746, + "step": 2180 + }, + { + "epoch": 6.878048780487805, + "grad_norm": 0.2009159233212597, + "learning_rate": 1.2437890512833089e-05, + "loss": 0.2804, + "step": 2181 + }, + { + "epoch": 6.881195908733281, + "grad_norm": 0.20361754932690013, + "learning_rate": 1.2422310065161162e-05, + "loss": 0.265, + "step": 2182 + }, + { + "epoch": 6.884343036978757, + "grad_norm": 0.20703457454924554, + "learning_rate": 1.240673962100715e-05, + "loss": 0.2686, + "step": 2183 + }, + { + "epoch": 6.887490165224233, + "grad_norm": 0.20068143636302968, + "learning_rate": 1.2391179196633776e-05, + "loss": 0.2763, + "step": 2184 + }, + { + "epoch": 6.890637293469709, + "grad_norm": 0.20358246660033522, + "learning_rate": 1.2375628808293274e-05, + "loss": 0.2792, + "step": 2185 + }, + { + "epoch": 6.893784421715185, + "grad_norm": 0.2036764238431656, + "learning_rate": 1.2360088472227418e-05, + "loss": 0.2737, + "step": 2186 + }, + { + "epoch": 6.8969315499606605, + "grad_norm": 0.20147681379716198, + "learning_rate": 1.2344558204667475e-05, + "loss": 0.2725, + "step": 2187 + }, + { + "epoch": 6.900078678206137, + "grad_norm": 0.2071864222789661, + "learning_rate": 1.2329038021834193e-05, + "loss": 0.2709, + "step": 2188 + }, + { + "epoch": 6.903225806451613, + "grad_norm": 0.2043649714329845, + "learning_rate": 1.231352793993779e-05, + "loss": 0.2738, + "step": 2189 + }, + { + "epoch": 6.906372934697089, + "grad_norm": 0.20125299342080424, + "learning_rate": 1.2298027975177926e-05, + "loss": 0.2636, + "step": 2190 + }, + { + "epoch": 6.909520062942565, + "grad_norm": 0.20814571779048338, + "learning_rate": 1.2282538143743712e-05, + "loss": 0.2771, + "step": 2191 + }, + { + "epoch": 6.912667191188041, + "grad_norm": 0.20030130841003688, + "learning_rate": 1.2267058461813649e-05, + "loss": 0.2694, + "step": 2192 + }, + { + "epoch": 6.915814319433517, + "grad_norm": 0.21200310708703735, + "learning_rate": 1.2251588945555666e-05, + "loss": 0.2725, + "step": 2193 + }, + { + "epoch": 6.918961447678993, + "grad_norm": 0.20922436094459726, + "learning_rate": 1.2236129611127045e-05, + "loss": 0.2726, + "step": 2194 + }, + { + "epoch": 6.922108575924469, + "grad_norm": 0.19704648497715532, + "learning_rate": 1.2220680474674458e-05, + "loss": 0.2741, + "step": 2195 + }, + { + "epoch": 6.925255704169945, + "grad_norm": 0.21570986491875785, + "learning_rate": 1.2205241552333922e-05, + "loss": 0.2716, + "step": 2196 + }, + { + "epoch": 6.928402832415421, + "grad_norm": 0.20000918248634075, + "learning_rate": 1.218981286023077e-05, + "loss": 0.2791, + "step": 2197 + }, + { + "epoch": 6.931549960660897, + "grad_norm": 0.1997584203723108, + "learning_rate": 1.2174394414479667e-05, + "loss": 0.2783, + "step": 2198 + }, + { + "epoch": 6.934697088906373, + "grad_norm": 0.20118408426733328, + "learning_rate": 1.215898623118456e-05, + "loss": 0.2736, + "step": 2199 + }, + { + "epoch": 6.937844217151849, + "grad_norm": 0.19493904766515557, + "learning_rate": 1.2143588326438697e-05, + "loss": 0.2734, + "step": 2200 + }, + { + "epoch": 6.940991345397325, + "grad_norm": 0.21288261790118718, + "learning_rate": 1.2128200716324566e-05, + "loss": 0.2768, + "step": 2201 + }, + { + "epoch": 6.944138473642801, + "grad_norm": 0.1976856689237112, + "learning_rate": 1.2112823416913936e-05, + "loss": 0.2747, + "step": 2202 + }, + { + "epoch": 6.947285601888277, + "grad_norm": 0.2130261035735836, + "learning_rate": 1.2097456444267771e-05, + "loss": 0.2677, + "step": 2203 + }, + { + "epoch": 6.950432730133753, + "grad_norm": 0.19662526671296285, + "learning_rate": 1.208209981443627e-05, + "loss": 0.2717, + "step": 2204 + }, + { + "epoch": 6.9535798583792285, + "grad_norm": 0.20222706552909867, + "learning_rate": 1.2066753543458835e-05, + "loss": 0.2711, + "step": 2205 + }, + { + "epoch": 6.956726986624705, + "grad_norm": 0.2069577717265713, + "learning_rate": 1.2051417647364021e-05, + "loss": 0.2793, + "step": 2206 + }, + { + "epoch": 6.959874114870181, + "grad_norm": 0.203212313831048, + "learning_rate": 1.2036092142169582e-05, + "loss": 0.2763, + "step": 2207 + }, + { + "epoch": 6.963021243115657, + "grad_norm": 0.20509305087697424, + "learning_rate": 1.2020777043882386e-05, + "loss": 0.2759, + "step": 2208 + }, + { + "epoch": 6.966168371361133, + "grad_norm": 0.1989672235984331, + "learning_rate": 1.2005472368498457e-05, + "loss": 0.2713, + "step": 2209 + }, + { + "epoch": 6.969315499606609, + "grad_norm": 0.22138011183206288, + "learning_rate": 1.1990178132002913e-05, + "loss": 0.2692, + "step": 2210 + }, + { + "epoch": 6.972462627852085, + "grad_norm": 0.2094379431636141, + "learning_rate": 1.1974894350369981e-05, + "loss": 0.2788, + "step": 2211 + }, + { + "epoch": 6.975609756097561, + "grad_norm": 0.20436997575009863, + "learning_rate": 1.195962103956298e-05, + "loss": 0.2759, + "step": 2212 + }, + { + "epoch": 6.978756884343037, + "grad_norm": 0.204009380946763, + "learning_rate": 1.1944358215534258e-05, + "loss": 0.2701, + "step": 2213 + }, + { + "epoch": 6.9819040125885135, + "grad_norm": 0.20886713208010613, + "learning_rate": 1.1929105894225248e-05, + "loss": 0.2687, + "step": 2214 + }, + { + "epoch": 6.985051140833989, + "grad_norm": 0.20522061292808225, + "learning_rate": 1.1913864091566372e-05, + "loss": 0.2628, + "step": 2215 + }, + { + "epoch": 6.988198269079465, + "grad_norm": 0.202811286102291, + "learning_rate": 1.1898632823477121e-05, + "loss": 0.2757, + "step": 2216 + }, + { + "epoch": 6.991345397324941, + "grad_norm": 0.2192831752403655, + "learning_rate": 1.1883412105865925e-05, + "loss": 0.2698, + "step": 2217 + }, + { + "epoch": 6.994492525570417, + "grad_norm": 0.19233397196404134, + "learning_rate": 1.1868201954630238e-05, + "loss": 0.2723, + "step": 2218 + }, + { + "epoch": 6.997639653815893, + "grad_norm": 0.210128078621367, + "learning_rate": 1.185300238565645e-05, + "loss": 0.2774, + "step": 2219 + }, + { + "epoch": 7.003147128245476, + "grad_norm": 0.5714481470801931, + "learning_rate": 1.183781341481991e-05, + "loss": 0.4569, + "step": 2220 + }, + { + "epoch": 7.006294256490952, + "grad_norm": 0.4250625166239754, + "learning_rate": 1.1822635057984906e-05, + "loss": 0.2112, + "step": 2221 + }, + { + "epoch": 7.009441384736428, + "grad_norm": 0.2691450770861746, + "learning_rate": 1.1807467331004619e-05, + "loss": 0.2138, + "step": 2222 + }, + { + "epoch": 7.012588512981904, + "grad_norm": 0.6396419703643319, + "learning_rate": 1.179231024972115e-05, + "loss": 0.2188, + "step": 2223 + }, + { + "epoch": 7.01573564122738, + "grad_norm": 0.3063619660764144, + "learning_rate": 1.177716382996546e-05, + "loss": 0.2141, + "step": 2224 + }, + { + "epoch": 7.018882769472856, + "grad_norm": 0.37212547491407416, + "learning_rate": 1.1762028087557393e-05, + "loss": 0.207, + "step": 2225 + }, + { + "epoch": 7.022029897718332, + "grad_norm": 0.381130447699765, + "learning_rate": 1.1746903038305626e-05, + "loss": 0.2121, + "step": 2226 + }, + { + "epoch": 7.025177025963808, + "grad_norm": 0.3005238930589458, + "learning_rate": 1.1731788698007675e-05, + "loss": 0.2127, + "step": 2227 + }, + { + "epoch": 7.028324154209284, + "grad_norm": 0.3300998957168307, + "learning_rate": 1.1716685082449879e-05, + "loss": 0.2237, + "step": 2228 + }, + { + "epoch": 7.03147128245476, + "grad_norm": 0.38114455718692064, + "learning_rate": 1.1701592207407355e-05, + "loss": 0.2176, + "step": 2229 + }, + { + "epoch": 7.034618410700236, + "grad_norm": 0.30283344304928583, + "learning_rate": 1.1686510088644014e-05, + "loss": 0.2086, + "step": 2230 + }, + { + "epoch": 7.037765538945712, + "grad_norm": 0.3331464112562727, + "learning_rate": 1.167143874191254e-05, + "loss": 0.2075, + "step": 2231 + }, + { + "epoch": 7.040912667191188, + "grad_norm": 0.29800299120189844, + "learning_rate": 1.1656378182954357e-05, + "loss": 0.2052, + "step": 2232 + }, + { + "epoch": 7.044059795436664, + "grad_norm": 0.2894375393244374, + "learning_rate": 1.1641328427499614e-05, + "loss": 0.2071, + "step": 2233 + }, + { + "epoch": 7.04720692368214, + "grad_norm": 0.3155503383974769, + "learning_rate": 1.1626289491267197e-05, + "loss": 0.2161, + "step": 2234 + }, + { + "epoch": 7.050354051927616, + "grad_norm": 0.26776854986829085, + "learning_rate": 1.161126138996467e-05, + "loss": 0.2022, + "step": 2235 + }, + { + "epoch": 7.053501180173092, + "grad_norm": 0.27601227009065155, + "learning_rate": 1.1596244139288286e-05, + "loss": 0.2066, + "step": 2236 + }, + { + "epoch": 7.056648308418568, + "grad_norm": 0.27656033063047314, + "learning_rate": 1.1581237754922984e-05, + "loss": 0.2104, + "step": 2237 + }, + { + "epoch": 7.059795436664044, + "grad_norm": 0.2449882270902797, + "learning_rate": 1.1566242252542325e-05, + "loss": 0.2073, + "step": 2238 + }, + { + "epoch": 7.06294256490952, + "grad_norm": 0.2768374274550793, + "learning_rate": 1.1551257647808524e-05, + "loss": 0.2102, + "step": 2239 + }, + { + "epoch": 7.066089693154996, + "grad_norm": 0.26011823472626777, + "learning_rate": 1.1536283956372402e-05, + "loss": 0.2142, + "step": 2240 + }, + { + "epoch": 7.069236821400472, + "grad_norm": 0.25819042597992947, + "learning_rate": 1.1521321193873395e-05, + "loss": 0.208, + "step": 2241 + }, + { + "epoch": 7.072383949645948, + "grad_norm": 0.24665055592686272, + "learning_rate": 1.1506369375939506e-05, + "loss": 0.208, + "step": 2242 + }, + { + "epoch": 7.075531077891424, + "grad_norm": 0.24751137987885644, + "learning_rate": 1.1491428518187321e-05, + "loss": 0.2092, + "step": 2243 + }, + { + "epoch": 7.0786782061369005, + "grad_norm": 0.24624896065268903, + "learning_rate": 1.1476498636221978e-05, + "loss": 0.2087, + "step": 2244 + }, + { + "epoch": 7.081825334382376, + "grad_norm": 0.25982701212215953, + "learning_rate": 1.1461579745637143e-05, + "loss": 0.2063, + "step": 2245 + }, + { + "epoch": 7.084972462627852, + "grad_norm": 0.2397365742144504, + "learning_rate": 1.1446671862015013e-05, + "loss": 0.2151, + "step": 2246 + }, + { + "epoch": 7.088119590873328, + "grad_norm": 0.23751859174229084, + "learning_rate": 1.1431775000926272e-05, + "loss": 0.2067, + "step": 2247 + }, + { + "epoch": 7.091266719118804, + "grad_norm": 0.23996775925549246, + "learning_rate": 1.1416889177930113e-05, + "loss": 0.2113, + "step": 2248 + }, + { + "epoch": 7.09441384736428, + "grad_norm": 0.24150863328124383, + "learning_rate": 1.1402014408574177e-05, + "loss": 0.2125, + "step": 2249 + }, + { + "epoch": 7.097560975609756, + "grad_norm": 0.24163504765540855, + "learning_rate": 1.1387150708394586e-05, + "loss": 0.1962, + "step": 2250 + }, + { + "epoch": 7.100708103855232, + "grad_norm": 0.24010609549944184, + "learning_rate": 1.1372298092915868e-05, + "loss": 0.2141, + "step": 2251 + }, + { + "epoch": 7.103855232100708, + "grad_norm": 0.2454335372395361, + "learning_rate": 1.1357456577651007e-05, + "loss": 0.2105, + "step": 2252 + }, + { + "epoch": 7.1070023603461845, + "grad_norm": 0.23394725001346658, + "learning_rate": 1.1342626178101374e-05, + "loss": 0.2079, + "step": 2253 + }, + { + "epoch": 7.11014948859166, + "grad_norm": 0.23777743303747212, + "learning_rate": 1.132780690975673e-05, + "loss": 0.2114, + "step": 2254 + }, + { + "epoch": 7.113296616837136, + "grad_norm": 0.22606234526414365, + "learning_rate": 1.131299878809522e-05, + "loss": 0.2081, + "step": 2255 + }, + { + "epoch": 7.116443745082612, + "grad_norm": 0.2418578090305854, + "learning_rate": 1.1298201828583332e-05, + "loss": 0.2066, + "step": 2256 + }, + { + "epoch": 7.119590873328088, + "grad_norm": 0.23113427714810778, + "learning_rate": 1.1283416046675916e-05, + "loss": 0.2102, + "step": 2257 + }, + { + "epoch": 7.122738001573564, + "grad_norm": 0.2381266978689901, + "learning_rate": 1.1268641457816117e-05, + "loss": 0.207, + "step": 2258 + }, + { + "epoch": 7.12588512981904, + "grad_norm": 0.2361934040445735, + "learning_rate": 1.1253878077435436e-05, + "loss": 0.2158, + "step": 2259 + }, + { + "epoch": 7.129032258064516, + "grad_norm": 0.21836833345649923, + "learning_rate": 1.1239125920953615e-05, + "loss": 0.2134, + "step": 2260 + }, + { + "epoch": 7.132179386309992, + "grad_norm": 0.24198498582441896, + "learning_rate": 1.122438500377871e-05, + "loss": 0.2042, + "step": 2261 + }, + { + "epoch": 7.1353265145554685, + "grad_norm": 0.22713295487622734, + "learning_rate": 1.1209655341307024e-05, + "loss": 0.2117, + "step": 2262 + }, + { + "epoch": 7.138473642800944, + "grad_norm": 0.2343503065926964, + "learning_rate": 1.1194936948923103e-05, + "loss": 0.2098, + "step": 2263 + }, + { + "epoch": 7.14162077104642, + "grad_norm": 0.22909025791500967, + "learning_rate": 1.1180229841999726e-05, + "loss": 0.2106, + "step": 2264 + }, + { + "epoch": 7.144767899291896, + "grad_norm": 0.227228211003999, + "learning_rate": 1.1165534035897881e-05, + "loss": 0.2192, + "step": 2265 + }, + { + "epoch": 7.147915027537372, + "grad_norm": 0.22905608109888015, + "learning_rate": 1.1150849545966766e-05, + "loss": 0.2085, + "step": 2266 + }, + { + "epoch": 7.151062155782848, + "grad_norm": 0.21727537194341173, + "learning_rate": 1.1136176387543736e-05, + "loss": 0.2122, + "step": 2267 + }, + { + "epoch": 7.154209284028324, + "grad_norm": 0.23840050117066902, + "learning_rate": 1.1121514575954327e-05, + "loss": 0.2149, + "step": 2268 + }, + { + "epoch": 7.1573564122738, + "grad_norm": 0.22511280292668318, + "learning_rate": 1.1106864126512233e-05, + "loss": 0.2026, + "step": 2269 + }, + { + "epoch": 7.160503540519276, + "grad_norm": 0.2319999499213673, + "learning_rate": 1.109222505451925e-05, + "loss": 0.2045, + "step": 2270 + }, + { + "epoch": 7.1636506687647525, + "grad_norm": 0.22621753505730435, + "learning_rate": 1.1077597375265325e-05, + "loss": 0.2024, + "step": 2271 + }, + { + "epoch": 7.166797797010228, + "grad_norm": 0.22239947016703665, + "learning_rate": 1.1062981104028479e-05, + "loss": 0.2096, + "step": 2272 + }, + { + "epoch": 7.169944925255704, + "grad_norm": 0.23183568916578862, + "learning_rate": 1.1048376256074831e-05, + "loss": 0.2046, + "step": 2273 + }, + { + "epoch": 7.17309205350118, + "grad_norm": 0.23415363186834018, + "learning_rate": 1.1033782846658567e-05, + "loss": 0.2126, + "step": 2274 + }, + { + "epoch": 7.176239181746656, + "grad_norm": 0.21870216739732176, + "learning_rate": 1.1019200891021932e-05, + "loss": 0.201, + "step": 2275 + }, + { + "epoch": 7.1793863099921325, + "grad_norm": 0.23538602690513877, + "learning_rate": 1.1004630404395193e-05, + "loss": 0.2138, + "step": 2276 + }, + { + "epoch": 7.182533438237608, + "grad_norm": 0.2278346344735737, + "learning_rate": 1.0990071401996647e-05, + "loss": 0.2097, + "step": 2277 + }, + { + "epoch": 7.185680566483084, + "grad_norm": 0.22286482289133921, + "learning_rate": 1.0975523899032603e-05, + "loss": 0.2082, + "step": 2278 + }, + { + "epoch": 7.18882769472856, + "grad_norm": 0.21831126634768455, + "learning_rate": 1.0960987910697338e-05, + "loss": 0.2098, + "step": 2279 + }, + { + "epoch": 7.191974822974037, + "grad_norm": 0.23172372964239146, + "learning_rate": 1.0946463452173135e-05, + "loss": 0.2096, + "step": 2280 + }, + { + "epoch": 7.195121951219512, + "grad_norm": 0.22322237214331764, + "learning_rate": 1.0931950538630199e-05, + "loss": 0.2132, + "step": 2281 + }, + { + "epoch": 7.198269079464988, + "grad_norm": 0.22598175395258135, + "learning_rate": 1.0917449185226702e-05, + "loss": 0.2108, + "step": 2282 + }, + { + "epoch": 7.201416207710464, + "grad_norm": 0.22815280201115662, + "learning_rate": 1.090295940710873e-05, + "loss": 0.2135, + "step": 2283 + }, + { + "epoch": 7.20456333595594, + "grad_norm": 0.22423887146026672, + "learning_rate": 1.0888481219410286e-05, + "loss": 0.2155, + "step": 2284 + }, + { + "epoch": 7.2077104642014165, + "grad_norm": 0.23680769002713375, + "learning_rate": 1.087401463725326e-05, + "loss": 0.2115, + "step": 2285 + }, + { + "epoch": 7.210857592446892, + "grad_norm": 0.22339424747961625, + "learning_rate": 1.0859559675747427e-05, + "loss": 0.2073, + "step": 2286 + }, + { + "epoch": 7.214004720692368, + "grad_norm": 0.22958762279363118, + "learning_rate": 1.0845116349990418e-05, + "loss": 0.2102, + "step": 2287 + }, + { + "epoch": 7.217151848937844, + "grad_norm": 0.21905123849431263, + "learning_rate": 1.083068467506772e-05, + "loss": 0.2096, + "step": 2288 + }, + { + "epoch": 7.220298977183321, + "grad_norm": 0.2299465638743488, + "learning_rate": 1.0816264666052652e-05, + "loss": 0.2103, + "step": 2289 + }, + { + "epoch": 7.223446105428796, + "grad_norm": 0.22978612951320251, + "learning_rate": 1.0801856338006323e-05, + "loss": 0.2155, + "step": 2290 + }, + { + "epoch": 7.226593233674272, + "grad_norm": 0.22975189777607816, + "learning_rate": 1.0787459705977681e-05, + "loss": 0.2114, + "step": 2291 + }, + { + "epoch": 7.229740361919748, + "grad_norm": 0.22933284295055767, + "learning_rate": 1.0773074785003426e-05, + "loss": 0.2108, + "step": 2292 + }, + { + "epoch": 7.232887490165224, + "grad_norm": 0.23052487817754658, + "learning_rate": 1.0758701590108039e-05, + "loss": 0.2054, + "step": 2293 + }, + { + "epoch": 7.2360346184107005, + "grad_norm": 0.22513890179226442, + "learning_rate": 1.0744340136303765e-05, + "loss": 0.2069, + "step": 2294 + }, + { + "epoch": 7.239181746656176, + "grad_norm": 0.22537210978153835, + "learning_rate": 1.0729990438590558e-05, + "loss": 0.2154, + "step": 2295 + }, + { + "epoch": 7.242328874901652, + "grad_norm": 0.22711701620016747, + "learning_rate": 1.0715652511956122e-05, + "loss": 0.2117, + "step": 2296 + }, + { + "epoch": 7.245476003147128, + "grad_norm": 0.21447246807326145, + "learning_rate": 1.0701326371375842e-05, + "loss": 0.2099, + "step": 2297 + }, + { + "epoch": 7.248623131392605, + "grad_norm": 0.22924666327151738, + "learning_rate": 1.0687012031812818e-05, + "loss": 0.2059, + "step": 2298 + }, + { + "epoch": 7.25177025963808, + "grad_norm": 0.2161633462452467, + "learning_rate": 1.0672709508217796e-05, + "loss": 0.2071, + "step": 2299 + }, + { + "epoch": 7.254917387883556, + "grad_norm": 0.24671002964793948, + "learning_rate": 1.0658418815529204e-05, + "loss": 0.2194, + "step": 2300 + }, + { + "epoch": 7.258064516129032, + "grad_norm": 0.21221688808795114, + "learning_rate": 1.0644139968673101e-05, + "loss": 0.2182, + "step": 2301 + }, + { + "epoch": 7.261211644374509, + "grad_norm": 0.22568981603880797, + "learning_rate": 1.062987298256318e-05, + "loss": 0.2159, + "step": 2302 + }, + { + "epoch": 7.2643587726199845, + "grad_norm": 0.22104241538483152, + "learning_rate": 1.0615617872100752e-05, + "loss": 0.2041, + "step": 2303 + }, + { + "epoch": 7.26750590086546, + "grad_norm": 0.22669047159973574, + "learning_rate": 1.06013746521747e-05, + "loss": 0.2078, + "step": 2304 + }, + { + "epoch": 7.270653029110936, + "grad_norm": 0.2253716719320985, + "learning_rate": 1.0587143337661516e-05, + "loss": 0.2125, + "step": 2305 + }, + { + "epoch": 7.273800157356412, + "grad_norm": 0.22803877169918388, + "learning_rate": 1.0572923943425234e-05, + "loss": 0.2092, + "step": 2306 + }, + { + "epoch": 7.276947285601889, + "grad_norm": 0.2213507250976366, + "learning_rate": 1.0558716484317456e-05, + "loss": 0.2108, + "step": 2307 + }, + { + "epoch": 7.280094413847364, + "grad_norm": 0.2273540787687723, + "learning_rate": 1.05445209751773e-05, + "loss": 0.2134, + "step": 2308 + }, + { + "epoch": 7.28324154209284, + "grad_norm": 0.2138902730182339, + "learning_rate": 1.053033743083142e-05, + "loss": 0.2089, + "step": 2309 + }, + { + "epoch": 7.286388670338316, + "grad_norm": 0.23680899817380743, + "learning_rate": 1.0516165866093974e-05, + "loss": 0.2108, + "step": 2310 + }, + { + "epoch": 7.289535798583792, + "grad_norm": 0.22183476822950635, + "learning_rate": 1.0502006295766589e-05, + "loss": 0.2174, + "step": 2311 + }, + { + "epoch": 7.2926829268292686, + "grad_norm": 0.22004800892442652, + "learning_rate": 1.0487858734638385e-05, + "loss": 0.2151, + "step": 2312 + }, + { + "epoch": 7.295830055074744, + "grad_norm": 0.22181045274812225, + "learning_rate": 1.0473723197485914e-05, + "loss": 0.2025, + "step": 2313 + }, + { + "epoch": 7.29897718332022, + "grad_norm": 0.21908332323352983, + "learning_rate": 1.0459599699073206e-05, + "loss": 0.2162, + "step": 2314 + }, + { + "epoch": 7.302124311565696, + "grad_norm": 0.21884697231931952, + "learning_rate": 1.044548825415168e-05, + "loss": 0.2129, + "step": 2315 + }, + { + "epoch": 7.305271439811173, + "grad_norm": 0.2187517231572296, + "learning_rate": 1.043138887746018e-05, + "loss": 0.2092, + "step": 2316 + }, + { + "epoch": 7.3084185680566485, + "grad_norm": 0.22546922277138795, + "learning_rate": 1.041730158372496e-05, + "loss": 0.2062, + "step": 2317 + }, + { + "epoch": 7.311565696302124, + "grad_norm": 0.22614767597501462, + "learning_rate": 1.0403226387659628e-05, + "loss": 0.2141, + "step": 2318 + }, + { + "epoch": 7.3147128245476, + "grad_norm": 0.22707234003611404, + "learning_rate": 1.0389163303965186e-05, + "loss": 0.2122, + "step": 2319 + }, + { + "epoch": 7.317859952793077, + "grad_norm": 0.23186259964324954, + "learning_rate": 1.0375112347329946e-05, + "loss": 0.2146, + "step": 2320 + }, + { + "epoch": 7.321007081038553, + "grad_norm": 0.23276792906716168, + "learning_rate": 1.0361073532429594e-05, + "loss": 0.2103, + "step": 2321 + }, + { + "epoch": 7.324154209284028, + "grad_norm": 0.2074352547542711, + "learning_rate": 1.0347046873927104e-05, + "loss": 0.2104, + "step": 2322 + }, + { + "epoch": 7.327301337529504, + "grad_norm": 0.2236327394327096, + "learning_rate": 1.0333032386472775e-05, + "loss": 0.2155, + "step": 2323 + }, + { + "epoch": 7.33044846577498, + "grad_norm": 0.221050234723865, + "learning_rate": 1.0319030084704175e-05, + "loss": 0.2214, + "step": 2324 + }, + { + "epoch": 7.333595594020457, + "grad_norm": 0.2249617592191245, + "learning_rate": 1.0305039983246159e-05, + "loss": 0.2054, + "step": 2325 + }, + { + "epoch": 7.3367427222659325, + "grad_norm": 0.22698815261155295, + "learning_rate": 1.0291062096710837e-05, + "loss": 0.2071, + "step": 2326 + }, + { + "epoch": 7.339889850511408, + "grad_norm": 0.2268711614187744, + "learning_rate": 1.0277096439697552e-05, + "loss": 0.2145, + "step": 2327 + }, + { + "epoch": 7.343036978756884, + "grad_norm": 0.215143567561118, + "learning_rate": 1.0263143026792883e-05, + "loss": 0.207, + "step": 2328 + }, + { + "epoch": 7.34618410700236, + "grad_norm": 0.22328803868837543, + "learning_rate": 1.0249201872570614e-05, + "loss": 0.2183, + "step": 2329 + }, + { + "epoch": 7.349331235247837, + "grad_norm": 0.2218308643421254, + "learning_rate": 1.0235272991591732e-05, + "loss": 0.2099, + "step": 2330 + }, + { + "epoch": 7.352478363493312, + "grad_norm": 0.23227296918591858, + "learning_rate": 1.0221356398404398e-05, + "loss": 0.2096, + "step": 2331 + }, + { + "epoch": 7.355625491738788, + "grad_norm": 0.2387762137802973, + "learning_rate": 1.0207452107543955e-05, + "loss": 0.2065, + "step": 2332 + }, + { + "epoch": 7.358772619984264, + "grad_norm": 0.22570367340945718, + "learning_rate": 1.0193560133532868e-05, + "loss": 0.2131, + "step": 2333 + }, + { + "epoch": 7.361919748229741, + "grad_norm": 0.2306105201682074, + "learning_rate": 1.017968049088076e-05, + "loss": 0.2166, + "step": 2334 + }, + { + "epoch": 7.3650668764752165, + "grad_norm": 0.2247866318155448, + "learning_rate": 1.0165813194084375e-05, + "loss": 0.2065, + "step": 2335 + }, + { + "epoch": 7.368214004720692, + "grad_norm": 0.22844131668659315, + "learning_rate": 1.0151958257627541e-05, + "loss": 0.2094, + "step": 2336 + }, + { + "epoch": 7.371361132966168, + "grad_norm": 0.23333574403162458, + "learning_rate": 1.0138115695981207e-05, + "loss": 0.213, + "step": 2337 + }, + { + "epoch": 7.374508261211645, + "grad_norm": 0.21257237150019098, + "learning_rate": 1.0124285523603365e-05, + "loss": 0.2187, + "step": 2338 + }, + { + "epoch": 7.377655389457121, + "grad_norm": 0.22969384430433795, + "learning_rate": 1.01104677549391e-05, + "loss": 0.2108, + "step": 2339 + }, + { + "epoch": 7.380802517702596, + "grad_norm": 0.23754367381929004, + "learning_rate": 1.0096662404420501e-05, + "loss": 0.2132, + "step": 2340 + }, + { + "epoch": 7.383949645948072, + "grad_norm": 0.22700013636080565, + "learning_rate": 1.0082869486466729e-05, + "loss": 0.2067, + "step": 2341 + }, + { + "epoch": 7.387096774193548, + "grad_norm": 0.23919755857430938, + "learning_rate": 1.006908901548394e-05, + "loss": 0.2117, + "step": 2342 + }, + { + "epoch": 7.390243902439025, + "grad_norm": 0.227136733402989, + "learning_rate": 1.0055321005865277e-05, + "loss": 0.2162, + "step": 2343 + }, + { + "epoch": 7.3933910306845005, + "grad_norm": 0.23525073363793073, + "learning_rate": 1.0041565471990897e-05, + "loss": 0.2112, + "step": 2344 + }, + { + "epoch": 7.396538158929976, + "grad_norm": 0.2321185458009399, + "learning_rate": 1.0027822428227889e-05, + "loss": 0.215, + "step": 2345 + }, + { + "epoch": 7.399685287175452, + "grad_norm": 0.23528217492361306, + "learning_rate": 1.0014091888930344e-05, + "loss": 0.2142, + "step": 2346 + }, + { + "epoch": 7.402832415420928, + "grad_norm": 0.22749689788373387, + "learning_rate": 1.0000373868439248e-05, + "loss": 0.2158, + "step": 2347 + }, + { + "epoch": 7.405979543666405, + "grad_norm": 0.2404493638710273, + "learning_rate": 9.986668381082545e-06, + "loss": 0.2168, + "step": 2348 + }, + { + "epoch": 7.4091266719118805, + "grad_norm": 0.22585072391780345, + "learning_rate": 9.972975441175057e-06, + "loss": 0.2164, + "step": 2349 + }, + { + "epoch": 7.412273800157356, + "grad_norm": 0.23795916213633916, + "learning_rate": 9.959295063018526e-06, + "loss": 0.215, + "step": 2350 + }, + { + "epoch": 7.415420928402832, + "grad_norm": 0.23204552138933593, + "learning_rate": 9.945627260901571e-06, + "loss": 0.2174, + "step": 2351 + }, + { + "epoch": 7.418568056648309, + "grad_norm": 0.22824560817611173, + "learning_rate": 9.93197204909966e-06, + "loss": 0.2111, + "step": 2352 + }, + { + "epoch": 7.421715184893785, + "grad_norm": 0.2358749174129253, + "learning_rate": 9.918329441875129e-06, + "loss": 0.2132, + "step": 2353 + }, + { + "epoch": 7.42486231313926, + "grad_norm": 0.23304655894118764, + "learning_rate": 9.904699453477136e-06, + "loss": 0.2121, + "step": 2354 + }, + { + "epoch": 7.428009441384736, + "grad_norm": 0.2305516088388655, + "learning_rate": 9.891082098141667e-06, + "loss": 0.2165, + "step": 2355 + }, + { + "epoch": 7.431156569630213, + "grad_norm": 0.23079140563064027, + "learning_rate": 9.877477390091509e-06, + "loss": 0.2141, + "step": 2356 + }, + { + "epoch": 7.434303697875689, + "grad_norm": 0.22387025416375533, + "learning_rate": 9.863885343536238e-06, + "loss": 0.2121, + "step": 2357 + }, + { + "epoch": 7.4374508261211645, + "grad_norm": 0.22787402873623003, + "learning_rate": 9.850305972672214e-06, + "loss": 0.2203, + "step": 2358 + }, + { + "epoch": 7.44059795436664, + "grad_norm": 0.22535783554358702, + "learning_rate": 9.836739291682543e-06, + "loss": 0.2154, + "step": 2359 + }, + { + "epoch": 7.443745082612116, + "grad_norm": 0.22981126911531366, + "learning_rate": 9.823185314737104e-06, + "loss": 0.2156, + "step": 2360 + }, + { + "epoch": 7.446892210857593, + "grad_norm": 0.23338901289009809, + "learning_rate": 9.809644055992471e-06, + "loss": 0.2112, + "step": 2361 + }, + { + "epoch": 7.450039339103069, + "grad_norm": 0.24023663975496, + "learning_rate": 9.796115529591967e-06, + "loss": 0.2093, + "step": 2362 + }, + { + "epoch": 7.453186467348544, + "grad_norm": 0.22580520597689485, + "learning_rate": 9.78259974966559e-06, + "loss": 0.2175, + "step": 2363 + }, + { + "epoch": 7.45633359559402, + "grad_norm": 0.2221577009585905, + "learning_rate": 9.769096730330047e-06, + "loss": 0.2128, + "step": 2364 + }, + { + "epoch": 7.459480723839496, + "grad_norm": 0.2314324525926755, + "learning_rate": 9.755606485688695e-06, + "loss": 0.2064, + "step": 2365 + }, + { + "epoch": 7.462627852084973, + "grad_norm": 0.2234171977467309, + "learning_rate": 9.742129029831569e-06, + "loss": 0.2137, + "step": 2366 + }, + { + "epoch": 7.4657749803304485, + "grad_norm": 0.23731134897981873, + "learning_rate": 9.728664376835343e-06, + "loss": 0.2134, + "step": 2367 + }, + { + "epoch": 7.468922108575924, + "grad_norm": 0.21962718713348828, + "learning_rate": 9.7152125407633e-06, + "loss": 0.2108, + "step": 2368 + }, + { + "epoch": 7.4720692368214, + "grad_norm": 0.2207798183423775, + "learning_rate": 9.701773535665366e-06, + "loss": 0.2101, + "step": 2369 + }, + { + "epoch": 7.475216365066877, + "grad_norm": 0.23437916694512362, + "learning_rate": 9.688347375578033e-06, + "loss": 0.2154, + "step": 2370 + }, + { + "epoch": 7.478363493312353, + "grad_norm": 0.22408835369735966, + "learning_rate": 9.674934074524411e-06, + "loss": 0.2172, + "step": 2371 + }, + { + "epoch": 7.481510621557828, + "grad_norm": 0.22121992831685067, + "learning_rate": 9.661533646514142e-06, + "loss": 0.2088, + "step": 2372 + }, + { + "epoch": 7.484657749803304, + "grad_norm": 0.21478252709139647, + "learning_rate": 9.648146105543457e-06, + "loss": 0.213, + "step": 2373 + }, + { + "epoch": 7.487804878048781, + "grad_norm": 0.22236538402387201, + "learning_rate": 9.634771465595109e-06, + "loss": 0.2146, + "step": 2374 + }, + { + "epoch": 7.490952006294257, + "grad_norm": 0.2329798548119093, + "learning_rate": 9.62140974063838e-06, + "loss": 0.2147, + "step": 2375 + }, + { + "epoch": 7.4940991345397325, + "grad_norm": 0.20764196366436552, + "learning_rate": 9.608060944629065e-06, + "loss": 0.2158, + "step": 2376 + }, + { + "epoch": 7.497246262785208, + "grad_norm": 0.22039448738264225, + "learning_rate": 9.59472509150945e-06, + "loss": 0.2131, + "step": 2377 + }, + { + "epoch": 7.500393391030684, + "grad_norm": 0.21967232307742138, + "learning_rate": 9.581402195208307e-06, + "loss": 0.2155, + "step": 2378 + }, + { + "epoch": 7.503540519276161, + "grad_norm": 0.23165634584475214, + "learning_rate": 9.568092269640867e-06, + "loss": 0.2058, + "step": 2379 + }, + { + "epoch": 7.506687647521637, + "grad_norm": 0.21342819732195714, + "learning_rate": 9.554795328708833e-06, + "loss": 0.2212, + "step": 2380 + }, + { + "epoch": 7.5098347757671124, + "grad_norm": 0.21653855605412423, + "learning_rate": 9.541511386300321e-06, + "loss": 0.2184, + "step": 2381 + }, + { + "epoch": 7.512981904012588, + "grad_norm": 0.212878668638118, + "learning_rate": 9.528240456289887e-06, + "loss": 0.2191, + "step": 2382 + }, + { + "epoch": 7.516129032258064, + "grad_norm": 0.22346959923074045, + "learning_rate": 9.5149825525385e-06, + "loss": 0.214, + "step": 2383 + }, + { + "epoch": 7.519276160503541, + "grad_norm": 0.22445356716908701, + "learning_rate": 9.5017376888935e-06, + "loss": 0.2115, + "step": 2384 + }, + { + "epoch": 7.522423288749017, + "grad_norm": 0.22584124484887466, + "learning_rate": 9.488505879188638e-06, + "loss": 0.2104, + "step": 2385 + }, + { + "epoch": 7.525570416994492, + "grad_norm": 0.2278568808064928, + "learning_rate": 9.475287137244006e-06, + "loss": 0.2119, + "step": 2386 + }, + { + "epoch": 7.528717545239968, + "grad_norm": 0.22171779725801655, + "learning_rate": 9.462081476866061e-06, + "loss": 0.2092, + "step": 2387 + }, + { + "epoch": 7.531864673485445, + "grad_norm": 0.22622915992130538, + "learning_rate": 9.44888891184758e-06, + "loss": 0.2116, + "step": 2388 + }, + { + "epoch": 7.535011801730921, + "grad_norm": 0.21608386506577046, + "learning_rate": 9.435709455967696e-06, + "loss": 0.2125, + "step": 2389 + }, + { + "epoch": 7.5381589299763965, + "grad_norm": 0.2190850436150323, + "learning_rate": 9.422543122991816e-06, + "loss": 0.215, + "step": 2390 + }, + { + "epoch": 7.541306058221872, + "grad_norm": 0.22266706339169948, + "learning_rate": 9.409389926671652e-06, + "loss": 0.2231, + "step": 2391 + }, + { + "epoch": 7.544453186467349, + "grad_norm": 0.223823120767086, + "learning_rate": 9.396249880745208e-06, + "loss": 0.2096, + "step": 2392 + }, + { + "epoch": 7.547600314712825, + "grad_norm": 0.21474334984155782, + "learning_rate": 9.383122998936728e-06, + "loss": 0.2211, + "step": 2393 + }, + { + "epoch": 7.550747442958301, + "grad_norm": 0.22017470608910686, + "learning_rate": 9.370009294956731e-06, + "loss": 0.2127, + "step": 2394 + }, + { + "epoch": 7.553894571203776, + "grad_norm": 0.22623985048590417, + "learning_rate": 9.356908782501953e-06, + "loss": 0.2079, + "step": 2395 + }, + { + "epoch": 7.557041699449252, + "grad_norm": 0.21790079455618502, + "learning_rate": 9.34382147525537e-06, + "loss": 0.2084, + "step": 2396 + }, + { + "epoch": 7.560188827694729, + "grad_norm": 0.23038553545405369, + "learning_rate": 9.330747386886145e-06, + "loss": 0.2144, + "step": 2397 + }, + { + "epoch": 7.563335955940205, + "grad_norm": 0.22823441881568057, + "learning_rate": 9.317686531049651e-06, + "loss": 0.2155, + "step": 2398 + }, + { + "epoch": 7.5664830841856805, + "grad_norm": 0.22875720062537966, + "learning_rate": 9.30463892138744e-06, + "loss": 0.2163, + "step": 2399 + }, + { + "epoch": 7.569630212431156, + "grad_norm": 0.2289164728738839, + "learning_rate": 9.291604571527218e-06, + "loss": 0.2136, + "step": 2400 + }, + { + "epoch": 7.572777340676632, + "grad_norm": 0.215177642884113, + "learning_rate": 9.27858349508285e-06, + "loss": 0.2091, + "step": 2401 + }, + { + "epoch": 7.575924468922109, + "grad_norm": 0.2274883491248114, + "learning_rate": 9.265575705654322e-06, + "loss": 0.2109, + "step": 2402 + }, + { + "epoch": 7.579071597167585, + "grad_norm": 0.23063099015253677, + "learning_rate": 9.252581216827778e-06, + "loss": 0.2007, + "step": 2403 + }, + { + "epoch": 7.58221872541306, + "grad_norm": 0.2218214636743055, + "learning_rate": 9.23960004217543e-06, + "loss": 0.2054, + "step": 2404 + }, + { + "epoch": 7.585365853658536, + "grad_norm": 0.23479587301596305, + "learning_rate": 9.226632195255612e-06, + "loss": 0.2109, + "step": 2405 + }, + { + "epoch": 7.588512981904013, + "grad_norm": 0.22498307321621322, + "learning_rate": 9.213677689612714e-06, + "loss": 0.2105, + "step": 2406 + }, + { + "epoch": 7.591660110149489, + "grad_norm": 0.23788611222524667, + "learning_rate": 9.200736538777214e-06, + "loss": 0.2082, + "step": 2407 + }, + { + "epoch": 7.5948072383949645, + "grad_norm": 0.21636929208162226, + "learning_rate": 9.18780875626563e-06, + "loss": 0.2097, + "step": 2408 + }, + { + "epoch": 7.59795436664044, + "grad_norm": 0.22912436986509432, + "learning_rate": 9.174894355580514e-06, + "loss": 0.208, + "step": 2409 + }, + { + "epoch": 7.601101494885917, + "grad_norm": 0.22835007748766775, + "learning_rate": 9.161993350210457e-06, + "loss": 0.2086, + "step": 2410 + }, + { + "epoch": 7.604248623131393, + "grad_norm": 0.22707280711620756, + "learning_rate": 9.149105753630033e-06, + "loss": 0.2137, + "step": 2411 + }, + { + "epoch": 7.607395751376869, + "grad_norm": 0.23071685677491993, + "learning_rate": 9.136231579299843e-06, + "loss": 0.2116, + "step": 2412 + }, + { + "epoch": 7.610542879622344, + "grad_norm": 0.21489892065441007, + "learning_rate": 9.123370840666437e-06, + "loss": 0.2108, + "step": 2413 + }, + { + "epoch": 7.61369000786782, + "grad_norm": 0.22666390058768177, + "learning_rate": 9.110523551162355e-06, + "loss": 0.2129, + "step": 2414 + }, + { + "epoch": 7.616837136113297, + "grad_norm": 0.22108033103329094, + "learning_rate": 9.097689724206085e-06, + "loss": 0.2147, + "step": 2415 + }, + { + "epoch": 7.619984264358773, + "grad_norm": 0.22927754219244445, + "learning_rate": 9.084869373202036e-06, + "loss": 0.2122, + "step": 2416 + }, + { + "epoch": 7.6231313926042485, + "grad_norm": 0.21679471833735905, + "learning_rate": 9.072062511540583e-06, + "loss": 0.2118, + "step": 2417 + }, + { + "epoch": 7.626278520849724, + "grad_norm": 0.22320372473610817, + "learning_rate": 9.059269152597964e-06, + "loss": 0.2146, + "step": 2418 + }, + { + "epoch": 7.6294256490952, + "grad_norm": 0.22512553465766197, + "learning_rate": 9.046489309736348e-06, + "loss": 0.212, + "step": 2419 + }, + { + "epoch": 7.632572777340677, + "grad_norm": 0.2183847269049812, + "learning_rate": 9.033722996303768e-06, + "loss": 0.2158, + "step": 2420 + }, + { + "epoch": 7.635719905586153, + "grad_norm": 0.22036959109568996, + "learning_rate": 9.020970225634136e-06, + "loss": 0.2164, + "step": 2421 + }, + { + "epoch": 7.6388670338316285, + "grad_norm": 0.22687740918307078, + "learning_rate": 9.008231011047213e-06, + "loss": 0.2146, + "step": 2422 + }, + { + "epoch": 7.642014162077104, + "grad_norm": 0.22039894721380168, + "learning_rate": 8.995505365848605e-06, + "loss": 0.2133, + "step": 2423 + }, + { + "epoch": 7.645161290322581, + "grad_norm": 0.22178747844096786, + "learning_rate": 8.982793303329751e-06, + "loss": 0.218, + "step": 2424 + }, + { + "epoch": 7.648308418568057, + "grad_norm": 0.21692147792936287, + "learning_rate": 8.970094836767888e-06, + "loss": 0.222, + "step": 2425 + }, + { + "epoch": 7.651455546813533, + "grad_norm": 0.2138257230931039, + "learning_rate": 8.957409979426072e-06, + "loss": 0.2089, + "step": 2426 + }, + { + "epoch": 7.654602675059008, + "grad_norm": 0.2265413919770675, + "learning_rate": 8.944738744553121e-06, + "loss": 0.2172, + "step": 2427 + }, + { + "epoch": 7.657749803304485, + "grad_norm": 0.22297257163502948, + "learning_rate": 8.93208114538365e-06, + "loss": 0.2121, + "step": 2428 + }, + { + "epoch": 7.660896931549961, + "grad_norm": 0.22148672698728822, + "learning_rate": 8.91943719513801e-06, + "loss": 0.2088, + "step": 2429 + }, + { + "epoch": 7.664044059795437, + "grad_norm": 0.21736370577760047, + "learning_rate": 8.906806907022311e-06, + "loss": 0.2153, + "step": 2430 + }, + { + "epoch": 7.6671911880409125, + "grad_norm": 0.22466767010680358, + "learning_rate": 8.894190294228391e-06, + "loss": 0.21, + "step": 2431 + }, + { + "epoch": 7.670338316286388, + "grad_norm": 0.22112902451894795, + "learning_rate": 8.881587369933799e-06, + "loss": 0.2175, + "step": 2432 + }, + { + "epoch": 7.673485444531865, + "grad_norm": 0.2214421346522819, + "learning_rate": 8.8689981473018e-06, + "loss": 0.214, + "step": 2433 + }, + { + "epoch": 7.676632572777341, + "grad_norm": 0.21631613538316005, + "learning_rate": 8.856422639481324e-06, + "loss": 0.2084, + "step": 2434 + }, + { + "epoch": 7.679779701022817, + "grad_norm": 0.22609337537385527, + "learning_rate": 8.843860859607001e-06, + "loss": 0.2147, + "step": 2435 + }, + { + "epoch": 7.682926829268292, + "grad_norm": 0.22739242141285246, + "learning_rate": 8.831312820799108e-06, + "loss": 0.2177, + "step": 2436 + }, + { + "epoch": 7.686073957513768, + "grad_norm": 0.22087226671865368, + "learning_rate": 8.81877853616358e-06, + "loss": 0.215, + "step": 2437 + }, + { + "epoch": 7.689221085759245, + "grad_norm": 0.226834895900349, + "learning_rate": 8.80625801879197e-06, + "loss": 0.212, + "step": 2438 + }, + { + "epoch": 7.692368214004721, + "grad_norm": 0.23470020627965657, + "learning_rate": 8.793751281761473e-06, + "loss": 0.215, + "step": 2439 + }, + { + "epoch": 7.6955153422501965, + "grad_norm": 0.21858002999304377, + "learning_rate": 8.781258338134882e-06, + "loss": 0.2195, + "step": 2440 + }, + { + "epoch": 7.698662470495672, + "grad_norm": 0.22766506836176625, + "learning_rate": 8.768779200960573e-06, + "loss": 0.2141, + "step": 2441 + }, + { + "epoch": 7.701809598741149, + "grad_norm": 0.2416124281891341, + "learning_rate": 8.756313883272518e-06, + "loss": 0.206, + "step": 2442 + }, + { + "epoch": 7.704956726986625, + "grad_norm": 0.22684959206739022, + "learning_rate": 8.74386239809024e-06, + "loss": 0.217, + "step": 2443 + }, + { + "epoch": 7.708103855232101, + "grad_norm": 0.22333064815479248, + "learning_rate": 8.731424758418837e-06, + "loss": 0.2238, + "step": 2444 + }, + { + "epoch": 7.711250983477576, + "grad_norm": 0.23426086091700454, + "learning_rate": 8.719000977248909e-06, + "loss": 0.2159, + "step": 2445 + }, + { + "epoch": 7.714398111723053, + "grad_norm": 0.23296607438783742, + "learning_rate": 8.706591067556625e-06, + "loss": 0.2149, + "step": 2446 + }, + { + "epoch": 7.717545239968529, + "grad_norm": 0.21675819624949147, + "learning_rate": 8.694195042303631e-06, + "loss": 0.2143, + "step": 2447 + }, + { + "epoch": 7.720692368214005, + "grad_norm": 0.21790836958624593, + "learning_rate": 8.681812914437088e-06, + "loss": 0.2163, + "step": 2448 + }, + { + "epoch": 7.7238394964594805, + "grad_norm": 0.2274002177866238, + "learning_rate": 8.669444696889645e-06, + "loss": 0.2132, + "step": 2449 + }, + { + "epoch": 7.726986624704956, + "grad_norm": 0.22632872665252599, + "learning_rate": 8.657090402579406e-06, + "loss": 0.2117, + "step": 2450 + }, + { + "epoch": 7.730133752950433, + "grad_norm": 0.22423037534942736, + "learning_rate": 8.64475004440995e-06, + "loss": 0.2147, + "step": 2451 + }, + { + "epoch": 7.733280881195909, + "grad_norm": 0.21721365887439806, + "learning_rate": 8.632423635270284e-06, + "loss": 0.213, + "step": 2452 + }, + { + "epoch": 7.736428009441385, + "grad_norm": 0.22467979551975023, + "learning_rate": 8.620111188034862e-06, + "loss": 0.2131, + "step": 2453 + }, + { + "epoch": 7.7395751376868605, + "grad_norm": 0.21890044167587794, + "learning_rate": 8.60781271556354e-06, + "loss": 0.2233, + "step": 2454 + }, + { + "epoch": 7.742722265932336, + "grad_norm": 0.23103144172132506, + "learning_rate": 8.595528230701591e-06, + "loss": 0.2125, + "step": 2455 + }, + { + "epoch": 7.745869394177813, + "grad_norm": 0.21442112221294987, + "learning_rate": 8.583257746279678e-06, + "loss": 0.2132, + "step": 2456 + }, + { + "epoch": 7.749016522423289, + "grad_norm": 0.23258158194634532, + "learning_rate": 8.571001275113825e-06, + "loss": 0.2121, + "step": 2457 + }, + { + "epoch": 7.752163650668765, + "grad_norm": 0.218808737474262, + "learning_rate": 8.55875883000544e-06, + "loss": 0.2099, + "step": 2458 + }, + { + "epoch": 7.755310778914241, + "grad_norm": 0.22662779438337177, + "learning_rate": 8.546530423741258e-06, + "loss": 0.2139, + "step": 2459 + }, + { + "epoch": 7.758457907159717, + "grad_norm": 0.22023777628464106, + "learning_rate": 8.534316069093385e-06, + "loss": 0.2198, + "step": 2460 + }, + { + "epoch": 7.761605035405193, + "grad_norm": 0.22730348731809383, + "learning_rate": 8.52211577881922e-06, + "loss": 0.2203, + "step": 2461 + }, + { + "epoch": 7.764752163650669, + "grad_norm": 0.224778305802601, + "learning_rate": 8.509929565661486e-06, + "loss": 0.2144, + "step": 2462 + }, + { + "epoch": 7.7678992918961445, + "grad_norm": 0.21387975638987688, + "learning_rate": 8.497757442348194e-06, + "loss": 0.2193, + "step": 2463 + }, + { + "epoch": 7.771046420141621, + "grad_norm": 0.22076113317664492, + "learning_rate": 8.485599421592648e-06, + "loss": 0.2212, + "step": 2464 + }, + { + "epoch": 7.774193548387097, + "grad_norm": 0.2191624701650899, + "learning_rate": 8.473455516093427e-06, + "loss": 0.2194, + "step": 2465 + }, + { + "epoch": 7.777340676632573, + "grad_norm": 0.22080484859112698, + "learning_rate": 8.461325738534349e-06, + "loss": 0.2166, + "step": 2466 + }, + { + "epoch": 7.780487804878049, + "grad_norm": 0.2286105390431359, + "learning_rate": 8.449210101584495e-06, + "loss": 0.2101, + "step": 2467 + }, + { + "epoch": 7.783634933123524, + "grad_norm": 0.22181973516008022, + "learning_rate": 8.43710861789816e-06, + "loss": 0.2111, + "step": 2468 + }, + { + "epoch": 7.786782061369001, + "grad_norm": 0.22834807098550453, + "learning_rate": 8.42502130011487e-06, + "loss": 0.2203, + "step": 2469 + }, + { + "epoch": 7.789929189614477, + "grad_norm": 0.2155756950754227, + "learning_rate": 8.412948160859346e-06, + "loss": 0.2078, + "step": 2470 + }, + { + "epoch": 7.793076317859953, + "grad_norm": 0.22460211874784158, + "learning_rate": 8.400889212741506e-06, + "loss": 0.2138, + "step": 2471 + }, + { + "epoch": 7.7962234461054285, + "grad_norm": 0.23266223147738663, + "learning_rate": 8.388844468356447e-06, + "loss": 0.2082, + "step": 2472 + }, + { + "epoch": 7.799370574350904, + "grad_norm": 0.22066055467773144, + "learning_rate": 8.37681394028442e-06, + "loss": 0.2167, + "step": 2473 + }, + { + "epoch": 7.802517702596381, + "grad_norm": 0.22436154174278092, + "learning_rate": 8.364797641090839e-06, + "loss": 0.2219, + "step": 2474 + }, + { + "epoch": 7.805664830841857, + "grad_norm": 0.22160888608683596, + "learning_rate": 8.352795583326255e-06, + "loss": 0.2205, + "step": 2475 + }, + { + "epoch": 7.808811959087333, + "grad_norm": 0.2226551886278477, + "learning_rate": 8.340807779526345e-06, + "loss": 0.2176, + "step": 2476 + }, + { + "epoch": 7.811959087332809, + "grad_norm": 0.21580763083372614, + "learning_rate": 8.328834242211887e-06, + "loss": 0.2163, + "step": 2477 + }, + { + "epoch": 7.815106215578285, + "grad_norm": 0.22040811507283187, + "learning_rate": 8.316874983888774e-06, + "loss": 0.2107, + "step": 2478 + }, + { + "epoch": 7.818253343823761, + "grad_norm": 0.2291989714865517, + "learning_rate": 8.304930017047969e-06, + "loss": 0.2032, + "step": 2479 + }, + { + "epoch": 7.821400472069237, + "grad_norm": 0.22911848156820763, + "learning_rate": 8.292999354165525e-06, + "loss": 0.2082, + "step": 2480 + }, + { + "epoch": 7.8245476003147125, + "grad_norm": 0.23926926990020336, + "learning_rate": 8.281083007702546e-06, + "loss": 0.2095, + "step": 2481 + }, + { + "epoch": 7.827694728560189, + "grad_norm": 0.22512534340178053, + "learning_rate": 8.26918099010518e-06, + "loss": 0.2173, + "step": 2482 + }, + { + "epoch": 7.830841856805665, + "grad_norm": 0.2167774809962824, + "learning_rate": 8.25729331380462e-06, + "loss": 0.2151, + "step": 2483 + }, + { + "epoch": 7.833988985051141, + "grad_norm": 0.2129622004627707, + "learning_rate": 8.245419991217063e-06, + "loss": 0.2175, + "step": 2484 + }, + { + "epoch": 7.837136113296617, + "grad_norm": 0.2234270903188792, + "learning_rate": 8.233561034743737e-06, + "loss": 0.2117, + "step": 2485 + }, + { + "epoch": 7.840283241542092, + "grad_norm": 0.22836568327537848, + "learning_rate": 8.221716456770838e-06, + "loss": 0.2136, + "step": 2486 + }, + { + "epoch": 7.843430369787569, + "grad_norm": 0.21719647596939415, + "learning_rate": 8.209886269669569e-06, + "loss": 0.216, + "step": 2487 + }, + { + "epoch": 7.846577498033045, + "grad_norm": 0.2266420312585156, + "learning_rate": 8.198070485796087e-06, + "loss": 0.2156, + "step": 2488 + }, + { + "epoch": 7.849724626278521, + "grad_norm": 0.2231981024827487, + "learning_rate": 8.186269117491515e-06, + "loss": 0.2078, + "step": 2489 + }, + { + "epoch": 7.8528717545239966, + "grad_norm": 0.2216098578898215, + "learning_rate": 8.174482177081914e-06, + "loss": 0.2098, + "step": 2490 + }, + { + "epoch": 7.856018882769473, + "grad_norm": 0.22092053778425194, + "learning_rate": 8.162709676878274e-06, + "loss": 0.2149, + "step": 2491 + }, + { + "epoch": 7.859166011014949, + "grad_norm": 0.2181787736211054, + "learning_rate": 8.15095162917651e-06, + "loss": 0.2147, + "step": 2492 + }, + { + "epoch": 7.862313139260425, + "grad_norm": 0.21707933494315532, + "learning_rate": 8.13920804625743e-06, + "loss": 0.2144, + "step": 2493 + }, + { + "epoch": 7.865460267505901, + "grad_norm": 0.21885761883840674, + "learning_rate": 8.12747894038675e-06, + "loss": 0.2176, + "step": 2494 + }, + { + "epoch": 7.868607395751377, + "grad_norm": 0.21519099154642782, + "learning_rate": 8.115764323815047e-06, + "loss": 0.2092, + "step": 2495 + }, + { + "epoch": 7.871754523996853, + "grad_norm": 0.2218096537191133, + "learning_rate": 8.10406420877778e-06, + "loss": 0.2142, + "step": 2496 + }, + { + "epoch": 7.874901652242329, + "grad_norm": 0.2160268035618521, + "learning_rate": 8.092378607495259e-06, + "loss": 0.2128, + "step": 2497 + }, + { + "epoch": 7.878048780487805, + "grad_norm": 0.21717472587214887, + "learning_rate": 8.080707532172621e-06, + "loss": 0.2089, + "step": 2498 + }, + { + "epoch": 7.881195908733281, + "grad_norm": 0.22978681303101012, + "learning_rate": 8.069050994999859e-06, + "loss": 0.2159, + "step": 2499 + }, + { + "epoch": 7.884343036978757, + "grad_norm": 0.2145432190672201, + "learning_rate": 8.057409008151747e-06, + "loss": 0.2191, + "step": 2500 + }, + { + "epoch": 7.887490165224233, + "grad_norm": 0.21225002883741487, + "learning_rate": 8.04578158378789e-06, + "loss": 0.213, + "step": 2501 + }, + { + "epoch": 7.890637293469709, + "grad_norm": 0.2238498839813341, + "learning_rate": 8.034168734052665e-06, + "loss": 0.2166, + "step": 2502 + }, + { + "epoch": 7.893784421715185, + "grad_norm": 0.20967970929834082, + "learning_rate": 8.022570471075239e-06, + "loss": 0.221, + "step": 2503 + }, + { + "epoch": 7.8969315499606605, + "grad_norm": 0.21708326506558323, + "learning_rate": 8.010986806969536e-06, + "loss": 0.2168, + "step": 2504 + }, + { + "epoch": 7.900078678206137, + "grad_norm": 0.2194455638945262, + "learning_rate": 7.999417753834237e-06, + "loss": 0.2159, + "step": 2505 + }, + { + "epoch": 7.903225806451613, + "grad_norm": 0.22999330162628176, + "learning_rate": 7.987863323752768e-06, + "loss": 0.2152, + "step": 2506 + }, + { + "epoch": 7.906372934697089, + "grad_norm": 0.23123822749524514, + "learning_rate": 7.976323528793253e-06, + "loss": 0.2114, + "step": 2507 + }, + { + "epoch": 7.909520062942565, + "grad_norm": 0.20876015592600447, + "learning_rate": 7.964798381008572e-06, + "loss": 0.2187, + "step": 2508 + }, + { + "epoch": 7.912667191188041, + "grad_norm": 0.23282553158404024, + "learning_rate": 7.95328789243627e-06, + "loss": 0.2141, + "step": 2509 + }, + { + "epoch": 7.915814319433517, + "grad_norm": 0.21665198107506672, + "learning_rate": 7.941792075098607e-06, + "loss": 0.22, + "step": 2510 + }, + { + "epoch": 7.918961447678993, + "grad_norm": 0.21353778052651864, + "learning_rate": 7.930310941002498e-06, + "loss": 0.2139, + "step": 2511 + }, + { + "epoch": 7.922108575924469, + "grad_norm": 0.21878104539924628, + "learning_rate": 7.918844502139542e-06, + "loss": 0.2178, + "step": 2512 + }, + { + "epoch": 7.925255704169945, + "grad_norm": 0.22382902656974832, + "learning_rate": 7.907392770485981e-06, + "loss": 0.2182, + "step": 2513 + }, + { + "epoch": 7.928402832415421, + "grad_norm": 0.21896656616114246, + "learning_rate": 7.895955758002692e-06, + "loss": 0.2046, + "step": 2514 + }, + { + "epoch": 7.931549960660897, + "grad_norm": 0.23683519616131976, + "learning_rate": 7.884533476635183e-06, + "loss": 0.2152, + "step": 2515 + }, + { + "epoch": 7.934697088906373, + "grad_norm": 0.2166277071545748, + "learning_rate": 7.873125938313572e-06, + "loss": 0.2107, + "step": 2516 + }, + { + "epoch": 7.937844217151849, + "grad_norm": 0.21951761687518714, + "learning_rate": 7.86173315495258e-06, + "loss": 0.2118, + "step": 2517 + }, + { + "epoch": 7.940991345397325, + "grad_norm": 0.2309665891777021, + "learning_rate": 7.850355138451522e-06, + "loss": 0.2136, + "step": 2518 + }, + { + "epoch": 7.944138473642801, + "grad_norm": 0.22022379458796373, + "learning_rate": 7.83899190069429e-06, + "loss": 0.2173, + "step": 2519 + }, + { + "epoch": 7.947285601888277, + "grad_norm": 0.2174782268352602, + "learning_rate": 7.827643453549325e-06, + "loss": 0.2192, + "step": 2520 + }, + { + "epoch": 7.950432730133753, + "grad_norm": 0.22169879590437622, + "learning_rate": 7.816309808869637e-06, + "loss": 0.2184, + "step": 2521 + }, + { + "epoch": 7.9535798583792285, + "grad_norm": 0.22065198802434965, + "learning_rate": 7.804990978492774e-06, + "loss": 0.2114, + "step": 2522 + }, + { + "epoch": 7.956726986624705, + "grad_norm": 0.2165885857788757, + "learning_rate": 7.793686974240795e-06, + "loss": 0.2132, + "step": 2523 + }, + { + "epoch": 7.959874114870181, + "grad_norm": 0.22336276390335888, + "learning_rate": 7.782397807920297e-06, + "loss": 0.2137, + "step": 2524 + }, + { + "epoch": 7.963021243115657, + "grad_norm": 0.21596192334489273, + "learning_rate": 7.771123491322353e-06, + "loss": 0.2162, + "step": 2525 + }, + { + "epoch": 7.966168371361133, + "grad_norm": 0.2122590682034009, + "learning_rate": 7.759864036222556e-06, + "loss": 0.2154, + "step": 2526 + }, + { + "epoch": 7.969315499606609, + "grad_norm": 0.2247749539500636, + "learning_rate": 7.748619454380947e-06, + "loss": 0.2143, + "step": 2527 + }, + { + "epoch": 7.972462627852085, + "grad_norm": 0.22164707239632309, + "learning_rate": 7.737389757542051e-06, + "loss": 0.22, + "step": 2528 + }, + { + "epoch": 7.975609756097561, + "grad_norm": 0.21942953911808707, + "learning_rate": 7.72617495743485e-06, + "loss": 0.2142, + "step": 2529 + }, + { + "epoch": 7.978756884343037, + "grad_norm": 0.2169333255982246, + "learning_rate": 7.714975065772747e-06, + "loss": 0.2167, + "step": 2530 + }, + { + "epoch": 7.9819040125885135, + "grad_norm": 0.226087767263674, + "learning_rate": 7.70379009425359e-06, + "loss": 0.219, + "step": 2531 + }, + { + "epoch": 7.985051140833989, + "grad_norm": 0.22119104996323627, + "learning_rate": 7.692620054559641e-06, + "loss": 0.2148, + "step": 2532 + }, + { + "epoch": 7.988198269079465, + "grad_norm": 0.22663737097763387, + "learning_rate": 7.681464958357565e-06, + "loss": 0.2134, + "step": 2533 + }, + { + "epoch": 7.991345397324941, + "grad_norm": 0.22890196789986905, + "learning_rate": 7.670324817298414e-06, + "loss": 0.2118, + "step": 2534 + }, + { + "epoch": 7.994492525570417, + "grad_norm": 0.21331469668678782, + "learning_rate": 7.659199643017628e-06, + "loss": 0.2174, + "step": 2535 + }, + { + "epoch": 7.997639653815893, + "grad_norm": 0.2216721447509053, + "learning_rate": 7.648089447135005e-06, + "loss": 0.2133, + "step": 2536 + }, + { + "epoch": 8.003147128245477, + "grad_norm": 0.4349977764263755, + "learning_rate": 7.63699424125471e-06, + "loss": 0.3762, + "step": 2537 + }, + { + "epoch": 8.006294256490952, + "grad_norm": 0.26742441223502816, + "learning_rate": 7.62591403696525e-06, + "loss": 0.1644, + "step": 2538 + }, + { + "epoch": 8.009441384736428, + "grad_norm": 0.388106689437148, + "learning_rate": 7.614848845839449e-06, + "loss": 0.1651, + "step": 2539 + }, + { + "epoch": 8.012588512981903, + "grad_norm": 0.5323255528110843, + "learning_rate": 7.603798679434472e-06, + "loss": 0.1682, + "step": 2540 + }, + { + "epoch": 8.01573564122738, + "grad_norm": 0.2923949539924144, + "learning_rate": 7.592763549291768e-06, + "loss": 0.1656, + "step": 2541 + }, + { + "epoch": 8.018882769472857, + "grad_norm": 0.38423574858459186, + "learning_rate": 7.58174346693711e-06, + "loss": 0.1721, + "step": 2542 + }, + { + "epoch": 8.022029897718332, + "grad_norm": 0.3833329737299957, + "learning_rate": 7.570738443880521e-06, + "loss": 0.1648, + "step": 2543 + }, + { + "epoch": 8.025177025963808, + "grad_norm": 0.30621817890570635, + "learning_rate": 7.559748491616319e-06, + "loss": 0.1675, + "step": 2544 + }, + { + "epoch": 8.028324154209285, + "grad_norm": 0.3082779392109665, + "learning_rate": 7.54877362162308e-06, + "loss": 0.1712, + "step": 2545 + }, + { + "epoch": 8.03147128245476, + "grad_norm": 0.3706427459165308, + "learning_rate": 7.537813845363604e-06, + "loss": 0.1665, + "step": 2546 + }, + { + "epoch": 8.034618410700237, + "grad_norm": 0.32385321291038643, + "learning_rate": 7.5268691742849665e-06, + "loss": 0.1657, + "step": 2547 + }, + { + "epoch": 8.037765538945711, + "grad_norm": 0.25807590952385684, + "learning_rate": 7.5159396198184246e-06, + "loss": 0.1641, + "step": 2548 + }, + { + "epoch": 8.040912667191188, + "grad_norm": 0.2867986489695364, + "learning_rate": 7.505025193379478e-06, + "loss": 0.1646, + "step": 2549 + }, + { + "epoch": 8.044059795436665, + "grad_norm": 0.29169617877555604, + "learning_rate": 7.494125906367801e-06, + "loss": 0.1613, + "step": 2550 + }, + { + "epoch": 8.04720692368214, + "grad_norm": 0.2527672771226682, + "learning_rate": 7.48324177016728e-06, + "loss": 0.1572, + "step": 2551 + }, + { + "epoch": 8.050354051927616, + "grad_norm": 0.2836106342803346, + "learning_rate": 7.47237279614595e-06, + "loss": 0.1687, + "step": 2552 + }, + { + "epoch": 8.053501180173091, + "grad_norm": 0.2975001770525274, + "learning_rate": 7.461518995656034e-06, + "loss": 0.164, + "step": 2553 + }, + { + "epoch": 8.056648308418568, + "grad_norm": 0.2735520742422293, + "learning_rate": 7.450680380033897e-06, + "loss": 0.1683, + "step": 2554 + }, + { + "epoch": 8.059795436664045, + "grad_norm": 0.2449222310949691, + "learning_rate": 7.439856960600038e-06, + "loss": 0.1634, + "step": 2555 + }, + { + "epoch": 8.06294256490952, + "grad_norm": 0.26957950448412327, + "learning_rate": 7.429048748659098e-06, + "loss": 0.164, + "step": 2556 + }, + { + "epoch": 8.066089693154996, + "grad_norm": 0.2468154213899654, + "learning_rate": 7.418255755499817e-06, + "loss": 0.1635, + "step": 2557 + }, + { + "epoch": 8.069236821400471, + "grad_norm": 0.24374149361084566, + "learning_rate": 7.407477992395058e-06, + "loss": 0.1653, + "step": 2558 + }, + { + "epoch": 8.072383949645948, + "grad_norm": 0.2639345175746216, + "learning_rate": 7.396715470601759e-06, + "loss": 0.1654, + "step": 2559 + }, + { + "epoch": 8.075531077891425, + "grad_norm": 0.2592056292755547, + "learning_rate": 7.385968201360953e-06, + "loss": 0.17, + "step": 2560 + }, + { + "epoch": 8.0786782061369, + "grad_norm": 0.24098620754378253, + "learning_rate": 7.375236195897737e-06, + "loss": 0.1598, + "step": 2561 + }, + { + "epoch": 8.081825334382376, + "grad_norm": 0.2392626040847066, + "learning_rate": 7.364519465421265e-06, + "loss": 0.1664, + "step": 2562 + }, + { + "epoch": 8.084972462627853, + "grad_norm": 0.23598245288642505, + "learning_rate": 7.353818021124745e-06, + "loss": 0.1676, + "step": 2563 + }, + { + "epoch": 8.088119590873328, + "grad_norm": 0.2434889824957857, + "learning_rate": 7.343131874185396e-06, + "loss": 0.1528, + "step": 2564 + }, + { + "epoch": 8.091266719118805, + "grad_norm": 0.23185343146126583, + "learning_rate": 7.332461035764492e-06, + "loss": 0.162, + "step": 2565 + }, + { + "epoch": 8.09441384736428, + "grad_norm": 0.24759550291938406, + "learning_rate": 7.32180551700729e-06, + "loss": 0.1643, + "step": 2566 + }, + { + "epoch": 8.097560975609756, + "grad_norm": 0.24247858902191025, + "learning_rate": 7.311165329043064e-06, + "loss": 0.1676, + "step": 2567 + }, + { + "epoch": 8.100708103855233, + "grad_norm": 0.23072668662855816, + "learning_rate": 7.300540482985061e-06, + "loss": 0.1659, + "step": 2568 + }, + { + "epoch": 8.103855232100708, + "grad_norm": 0.23644287711658252, + "learning_rate": 7.289930989930518e-06, + "loss": 0.1628, + "step": 2569 + }, + { + "epoch": 8.107002360346184, + "grad_norm": 0.22627740006753835, + "learning_rate": 7.279336860960633e-06, + "loss": 0.1606, + "step": 2570 + }, + { + "epoch": 8.11014948859166, + "grad_norm": 0.2291150949130993, + "learning_rate": 7.26875810714055e-06, + "loss": 0.1654, + "step": 2571 + }, + { + "epoch": 8.113296616837136, + "grad_norm": 0.28213249790664724, + "learning_rate": 7.25819473951936e-06, + "loss": 0.1754, + "step": 2572 + }, + { + "epoch": 8.116443745082613, + "grad_norm": 0.22998099293791194, + "learning_rate": 7.247646769130079e-06, + "loss": 0.1657, + "step": 2573 + }, + { + "epoch": 8.119590873328088, + "grad_norm": 0.228994767720687, + "learning_rate": 7.237114206989646e-06, + "loss": 0.1612, + "step": 2574 + }, + { + "epoch": 8.122738001573564, + "grad_norm": 0.22828083602687194, + "learning_rate": 7.226597064098905e-06, + "loss": 0.1579, + "step": 2575 + }, + { + "epoch": 8.12588512981904, + "grad_norm": 0.2296606678197253, + "learning_rate": 7.216095351442604e-06, + "loss": 0.164, + "step": 2576 + }, + { + "epoch": 8.129032258064516, + "grad_norm": 0.22801101601758597, + "learning_rate": 7.205609079989353e-06, + "loss": 0.1659, + "step": 2577 + }, + { + "epoch": 8.132179386309993, + "grad_norm": 0.2375397466097386, + "learning_rate": 7.195138260691652e-06, + "loss": 0.1615, + "step": 2578 + }, + { + "epoch": 8.135326514555468, + "grad_norm": 0.24227397129094566, + "learning_rate": 7.184682904485862e-06, + "loss": 0.1659, + "step": 2579 + }, + { + "epoch": 8.138473642800944, + "grad_norm": 0.23277206582166304, + "learning_rate": 7.1742430222921834e-06, + "loss": 0.1593, + "step": 2580 + }, + { + "epoch": 8.141620771046421, + "grad_norm": 0.22865926008781765, + "learning_rate": 7.163818625014662e-06, + "loss": 0.1615, + "step": 2581 + }, + { + "epoch": 8.144767899291896, + "grad_norm": 0.2294968243718884, + "learning_rate": 7.1534097235411674e-06, + "loss": 0.1643, + "step": 2582 + }, + { + "epoch": 8.147915027537373, + "grad_norm": 0.22695319257199334, + "learning_rate": 7.143016328743384e-06, + "loss": 0.1676, + "step": 2583 + }, + { + "epoch": 8.151062155782848, + "grad_norm": 0.2305261334568713, + "learning_rate": 7.132638451476801e-06, + "loss": 0.1716, + "step": 2584 + }, + { + "epoch": 8.154209284028324, + "grad_norm": 0.22658443236517037, + "learning_rate": 7.122276102580698e-06, + "loss": 0.1693, + "step": 2585 + }, + { + "epoch": 8.157356412273801, + "grad_norm": 0.23678326060091193, + "learning_rate": 7.111929292878147e-06, + "loss": 0.1659, + "step": 2586 + }, + { + "epoch": 8.160503540519276, + "grad_norm": 0.23144801338294188, + "learning_rate": 7.101598033175973e-06, + "loss": 0.1667, + "step": 2587 + }, + { + "epoch": 8.163650668764753, + "grad_norm": 0.2255681221831429, + "learning_rate": 7.091282334264773e-06, + "loss": 0.1684, + "step": 2588 + }, + { + "epoch": 8.166797797010227, + "grad_norm": 0.23550516194771806, + "learning_rate": 7.080982206918873e-06, + "loss": 0.1624, + "step": 2589 + }, + { + "epoch": 8.169944925255704, + "grad_norm": 0.23560675125042624, + "learning_rate": 7.070697661896368e-06, + "loss": 0.1597, + "step": 2590 + }, + { + "epoch": 8.17309205350118, + "grad_norm": 0.231816404825124, + "learning_rate": 7.060428709939047e-06, + "loss": 0.1648, + "step": 2591 + }, + { + "epoch": 8.176239181746656, + "grad_norm": 0.23355018538739725, + "learning_rate": 7.050175361772427e-06, + "loss": 0.1626, + "step": 2592 + }, + { + "epoch": 8.179386309992132, + "grad_norm": 0.24417933186787055, + "learning_rate": 7.039937628105717e-06, + "loss": 0.1651, + "step": 2593 + }, + { + "epoch": 8.182533438237607, + "grad_norm": 0.22574700897136932, + "learning_rate": 7.029715519631832e-06, + "loss": 0.1671, + "step": 2594 + }, + { + "epoch": 8.185680566483084, + "grad_norm": 0.2304324967468494, + "learning_rate": 7.019509047027362e-06, + "loss": 0.1672, + "step": 2595 + }, + { + "epoch": 8.18882769472856, + "grad_norm": 0.2287503066121252, + "learning_rate": 7.0093182209525525e-06, + "loss": 0.1627, + "step": 2596 + }, + { + "epoch": 8.191974822974036, + "grad_norm": 0.2242905190122735, + "learning_rate": 6.9991430520513306e-06, + "loss": 0.1577, + "step": 2597 + }, + { + "epoch": 8.195121951219512, + "grad_norm": 0.2475609011866222, + "learning_rate": 6.988983550951245e-06, + "loss": 0.1644, + "step": 2598 + }, + { + "epoch": 8.198269079464989, + "grad_norm": 0.23890330471937485, + "learning_rate": 6.9788397282635044e-06, + "loss": 0.1644, + "step": 2599 + }, + { + "epoch": 8.201416207710464, + "grad_norm": 0.2386410693285585, + "learning_rate": 6.968711594582919e-06, + "loss": 0.164, + "step": 2600 + }, + { + "epoch": 8.20456333595594, + "grad_norm": 0.2389681606873896, + "learning_rate": 6.958599160487927e-06, + "loss": 0.1623, + "step": 2601 + }, + { + "epoch": 8.207710464201416, + "grad_norm": 0.22261291520501994, + "learning_rate": 6.948502436540572e-06, + "loss": 0.159, + "step": 2602 + }, + { + "epoch": 8.210857592446892, + "grad_norm": 0.22847054610493028, + "learning_rate": 6.93842143328647e-06, + "loss": 0.1602, + "step": 2603 + }, + { + "epoch": 8.214004720692369, + "grad_norm": 0.233317294983424, + "learning_rate": 6.928356161254845e-06, + "loss": 0.162, + "step": 2604 + }, + { + "epoch": 8.217151848937844, + "grad_norm": 0.23035769556977229, + "learning_rate": 6.91830663095846e-06, + "loss": 0.1653, + "step": 2605 + }, + { + "epoch": 8.22029897718332, + "grad_norm": 0.2348863615456292, + "learning_rate": 6.908272852893666e-06, + "loss": 0.1708, + "step": 2606 + }, + { + "epoch": 8.223446105428796, + "grad_norm": 0.22854836342550924, + "learning_rate": 6.898254837540333e-06, + "loss": 0.169, + "step": 2607 + }, + { + "epoch": 8.226593233674272, + "grad_norm": 0.22732100111251707, + "learning_rate": 6.888252595361895e-06, + "loss": 0.164, + "step": 2608 + }, + { + "epoch": 8.229740361919749, + "grad_norm": 0.2211306149730069, + "learning_rate": 6.878266136805284e-06, + "loss": 0.1649, + "step": 2609 + }, + { + "epoch": 8.232887490165224, + "grad_norm": 0.23144901547851038, + "learning_rate": 6.86829547230097e-06, + "loss": 0.1672, + "step": 2610 + }, + { + "epoch": 8.2360346184107, + "grad_norm": 0.23413830665280178, + "learning_rate": 6.858340612262916e-06, + "loss": 0.1644, + "step": 2611 + }, + { + "epoch": 8.239181746656175, + "grad_norm": 0.2245328322035716, + "learning_rate": 6.848401567088575e-06, + "loss": 0.1623, + "step": 2612 + }, + { + "epoch": 8.242328874901652, + "grad_norm": 0.2351914752245803, + "learning_rate": 6.838478347158893e-06, + "loss": 0.1568, + "step": 2613 + }, + { + "epoch": 8.245476003147129, + "grad_norm": 0.22741744869087863, + "learning_rate": 6.828570962838271e-06, + "loss": 0.1647, + "step": 2614 + }, + { + "epoch": 8.248623131392604, + "grad_norm": 0.234581482295964, + "learning_rate": 6.81867942447459e-06, + "loss": 0.1625, + "step": 2615 + }, + { + "epoch": 8.25177025963808, + "grad_norm": 0.23787978249548633, + "learning_rate": 6.808803742399162e-06, + "loss": 0.1643, + "step": 2616 + }, + { + "epoch": 8.254917387883557, + "grad_norm": 0.22558874923524821, + "learning_rate": 6.798943926926748e-06, + "loss": 0.1655, + "step": 2617 + }, + { + "epoch": 8.258064516129032, + "grad_norm": 0.23066847653534014, + "learning_rate": 6.7890999883555365e-06, + "loss": 0.1598, + "step": 2618 + }, + { + "epoch": 8.261211644374509, + "grad_norm": 0.24121585112670477, + "learning_rate": 6.779271936967129e-06, + "loss": 0.1671, + "step": 2619 + }, + { + "epoch": 8.264358772619984, + "grad_norm": 0.23631017657037634, + "learning_rate": 6.769459783026544e-06, + "loss": 0.1662, + "step": 2620 + }, + { + "epoch": 8.26750590086546, + "grad_norm": 0.2391507536037999, + "learning_rate": 6.759663536782177e-06, + "loss": 0.1666, + "step": 2621 + }, + { + "epoch": 8.270653029110937, + "grad_norm": 0.22647269595449845, + "learning_rate": 6.74988320846583e-06, + "loss": 0.1646, + "step": 2622 + }, + { + "epoch": 8.273800157356412, + "grad_norm": 0.23536140779260983, + "learning_rate": 6.740118808292657e-06, + "loss": 0.174, + "step": 2623 + }, + { + "epoch": 8.276947285601889, + "grad_norm": 0.2186614203495139, + "learning_rate": 6.730370346461198e-06, + "loss": 0.1717, + "step": 2624 + }, + { + "epoch": 8.280094413847364, + "grad_norm": 0.2322550042532507, + "learning_rate": 6.720637833153325e-06, + "loss": 0.1659, + "step": 2625 + }, + { + "epoch": 8.28324154209284, + "grad_norm": 0.23867083019820706, + "learning_rate": 6.710921278534269e-06, + "loss": 0.164, + "step": 2626 + }, + { + "epoch": 8.286388670338317, + "grad_norm": 0.23732190600347555, + "learning_rate": 6.7012206927525926e-06, + "loss": 0.1683, + "step": 2627 + }, + { + "epoch": 8.289535798583792, + "grad_norm": 0.23792842922753502, + "learning_rate": 6.69153608594016e-06, + "loss": 0.1552, + "step": 2628 + }, + { + "epoch": 8.292682926829269, + "grad_norm": 0.2310163348962354, + "learning_rate": 6.681867468212171e-06, + "loss": 0.1669, + "step": 2629 + }, + { + "epoch": 8.295830055074743, + "grad_norm": 0.22593964960603174, + "learning_rate": 6.672214849667107e-06, + "loss": 0.1649, + "step": 2630 + }, + { + "epoch": 8.29897718332022, + "grad_norm": 0.23274451068025068, + "learning_rate": 6.66257824038675e-06, + "loss": 0.1644, + "step": 2631 + }, + { + "epoch": 8.302124311565697, + "grad_norm": 0.2276653833702071, + "learning_rate": 6.652957650436149e-06, + "loss": 0.1631, + "step": 2632 + }, + { + "epoch": 8.305271439811172, + "grad_norm": 0.22912711462824803, + "learning_rate": 6.643353089863644e-06, + "loss": 0.1673, + "step": 2633 + }, + { + "epoch": 8.308418568056648, + "grad_norm": 0.2374453905619608, + "learning_rate": 6.633764568700805e-06, + "loss": 0.1633, + "step": 2634 + }, + { + "epoch": 8.311565696302125, + "grad_norm": 0.23454962799092569, + "learning_rate": 6.624192096962468e-06, + "loss": 0.1578, + "step": 2635 + }, + { + "epoch": 8.3147128245476, + "grad_norm": 0.22400278973493876, + "learning_rate": 6.614635684646704e-06, + "loss": 0.1665, + "step": 2636 + }, + { + "epoch": 8.317859952793077, + "grad_norm": 0.2367047312346527, + "learning_rate": 6.6050953417348e-06, + "loss": 0.1659, + "step": 2637 + }, + { + "epoch": 8.321007081038552, + "grad_norm": 0.2396724707707527, + "learning_rate": 6.595571078191273e-06, + "loss": 0.1618, + "step": 2638 + }, + { + "epoch": 8.324154209284028, + "grad_norm": 0.23140731829793698, + "learning_rate": 6.586062903963832e-06, + "loss": 0.1653, + "step": 2639 + }, + { + "epoch": 8.327301337529505, + "grad_norm": 0.2402337835141489, + "learning_rate": 6.576570828983397e-06, + "loss": 0.1685, + "step": 2640 + }, + { + "epoch": 8.33044846577498, + "grad_norm": 0.24165802205944656, + "learning_rate": 6.5670948631640575e-06, + "loss": 0.1714, + "step": 2641 + }, + { + "epoch": 8.333595594020457, + "grad_norm": 0.22826464282932363, + "learning_rate": 6.557635016403086e-06, + "loss": 0.1655, + "step": 2642 + }, + { + "epoch": 8.336742722265932, + "grad_norm": 0.232516141417856, + "learning_rate": 6.548191298580923e-06, + "loss": 0.1644, + "step": 2643 + }, + { + "epoch": 8.339889850511408, + "grad_norm": 0.22800084761954714, + "learning_rate": 6.538763719561149e-06, + "loss": 0.1725, + "step": 2644 + }, + { + "epoch": 8.343036978756885, + "grad_norm": 0.23124756998898505, + "learning_rate": 6.529352289190507e-06, + "loss": 0.1669, + "step": 2645 + }, + { + "epoch": 8.34618410700236, + "grad_norm": 0.23164406300800636, + "learning_rate": 6.51995701729885e-06, + "loss": 0.1606, + "step": 2646 + }, + { + "epoch": 8.349331235247837, + "grad_norm": 0.23456181103996557, + "learning_rate": 6.510577913699186e-06, + "loss": 0.1626, + "step": 2647 + }, + { + "epoch": 8.352478363493312, + "grad_norm": 0.22874955849477083, + "learning_rate": 6.501214988187601e-06, + "loss": 0.1624, + "step": 2648 + }, + { + "epoch": 8.355625491738788, + "grad_norm": 0.23854686972912492, + "learning_rate": 6.491868250543312e-06, + "loss": 0.1642, + "step": 2649 + }, + { + "epoch": 8.358772619984265, + "grad_norm": 0.2372645112138056, + "learning_rate": 6.4825377105286044e-06, + "loss": 0.1655, + "step": 2650 + }, + { + "epoch": 8.36191974822974, + "grad_norm": 0.2319500615137312, + "learning_rate": 6.473223377888865e-06, + "loss": 0.1701, + "step": 2651 + }, + { + "epoch": 8.365066876475217, + "grad_norm": 0.23635738733339692, + "learning_rate": 6.463925262352549e-06, + "loss": 0.1648, + "step": 2652 + }, + { + "epoch": 8.368214004720693, + "grad_norm": 0.22916613347850073, + "learning_rate": 6.454643373631161e-06, + "loss": 0.167, + "step": 2653 + }, + { + "epoch": 8.371361132966168, + "grad_norm": 0.2343271870579212, + "learning_rate": 6.445377721419274e-06, + "loss": 0.1687, + "step": 2654 + }, + { + "epoch": 8.374508261211645, + "grad_norm": 0.23188613838557393, + "learning_rate": 6.436128315394487e-06, + "loss": 0.1626, + "step": 2655 + }, + { + "epoch": 8.37765538945712, + "grad_norm": 0.23834493012382535, + "learning_rate": 6.426895165217448e-06, + "loss": 0.17, + "step": 2656 + }, + { + "epoch": 8.380802517702596, + "grad_norm": 0.23537207044394082, + "learning_rate": 6.417678280531808e-06, + "loss": 0.1623, + "step": 2657 + }, + { + "epoch": 8.383949645948073, + "grad_norm": 0.23842873789085556, + "learning_rate": 6.408477670964244e-06, + "loss": 0.1671, + "step": 2658 + }, + { + "epoch": 8.387096774193548, + "grad_norm": 0.22315005761815868, + "learning_rate": 6.399293346124427e-06, + "loss": 0.1648, + "step": 2659 + }, + { + "epoch": 8.390243902439025, + "grad_norm": 0.2369572676127876, + "learning_rate": 6.390125315605016e-06, + "loss": 0.1669, + "step": 2660 + }, + { + "epoch": 8.3933910306845, + "grad_norm": 0.23656957128379635, + "learning_rate": 6.380973588981662e-06, + "loss": 0.1658, + "step": 2661 + }, + { + "epoch": 8.396538158929976, + "grad_norm": 0.23549710286212458, + "learning_rate": 6.371838175812977e-06, + "loss": 0.165, + "step": 2662 + }, + { + "epoch": 8.399685287175453, + "grad_norm": 0.23821077849443947, + "learning_rate": 6.362719085640544e-06, + "loss": 0.1644, + "step": 2663 + }, + { + "epoch": 8.402832415420928, + "grad_norm": 0.23812845227381751, + "learning_rate": 6.353616327988885e-06, + "loss": 0.1695, + "step": 2664 + }, + { + "epoch": 8.405979543666405, + "grad_norm": 0.23151702473551566, + "learning_rate": 6.344529912365477e-06, + "loss": 0.1664, + "step": 2665 + }, + { + "epoch": 8.40912667191188, + "grad_norm": 0.23515427535640315, + "learning_rate": 6.335459848260712e-06, + "loss": 0.1628, + "step": 2666 + }, + { + "epoch": 8.412273800157356, + "grad_norm": 0.24148276293516086, + "learning_rate": 6.326406145147919e-06, + "loss": 0.165, + "step": 2667 + }, + { + "epoch": 8.415420928402833, + "grad_norm": 0.22961657881825784, + "learning_rate": 6.3173688124833354e-06, + "loss": 0.1566, + "step": 2668 + }, + { + "epoch": 8.418568056648308, + "grad_norm": 0.23330366260196786, + "learning_rate": 6.3083478597060895e-06, + "loss": 0.1679, + "step": 2669 + }, + { + "epoch": 8.421715184893785, + "grad_norm": 0.23139753589023687, + "learning_rate": 6.299343296238215e-06, + "loss": 0.1715, + "step": 2670 + }, + { + "epoch": 8.424862313139261, + "grad_norm": 0.23233022652711008, + "learning_rate": 6.290355131484619e-06, + "loss": 0.1625, + "step": 2671 + }, + { + "epoch": 8.428009441384736, + "grad_norm": 0.23192930432672745, + "learning_rate": 6.281383374833088e-06, + "loss": 0.1661, + "step": 2672 + }, + { + "epoch": 8.431156569630213, + "grad_norm": 0.237960894230701, + "learning_rate": 6.272428035654258e-06, + "loss": 0.1664, + "step": 2673 + }, + { + "epoch": 8.434303697875688, + "grad_norm": 0.23781605768359015, + "learning_rate": 6.263489123301633e-06, + "loss": 0.1682, + "step": 2674 + }, + { + "epoch": 8.437450826121164, + "grad_norm": 0.23308152261123055, + "learning_rate": 6.254566647111552e-06, + "loss": 0.1684, + "step": 2675 + }, + { + "epoch": 8.440597954366641, + "grad_norm": 0.24050735086700006, + "learning_rate": 6.2456606164031865e-06, + "loss": 0.1691, + "step": 2676 + }, + { + "epoch": 8.443745082612116, + "grad_norm": 0.2332745071775824, + "learning_rate": 6.23677104047854e-06, + "loss": 0.1684, + "step": 2677 + }, + { + "epoch": 8.446892210857593, + "grad_norm": 0.23439482464949807, + "learning_rate": 6.22789792862241e-06, + "loss": 0.1644, + "step": 2678 + }, + { + "epoch": 8.450039339103068, + "grad_norm": 0.22619429799525462, + "learning_rate": 6.219041290102423e-06, + "loss": 0.1633, + "step": 2679 + }, + { + "epoch": 8.453186467348544, + "grad_norm": 0.23111017539558812, + "learning_rate": 6.210201134168976e-06, + "loss": 0.1686, + "step": 2680 + }, + { + "epoch": 8.456333595594021, + "grad_norm": 0.23077386898729263, + "learning_rate": 6.201377470055274e-06, + "loss": 0.1643, + "step": 2681 + }, + { + "epoch": 8.459480723839496, + "grad_norm": 0.2314358817989027, + "learning_rate": 6.192570306977274e-06, + "loss": 0.1659, + "step": 2682 + }, + { + "epoch": 8.462627852084973, + "grad_norm": 0.2429161146850131, + "learning_rate": 6.183779654133711e-06, + "loss": 0.1658, + "step": 2683 + }, + { + "epoch": 8.465774980330448, + "grad_norm": 0.22921232924755053, + "learning_rate": 6.175005520706083e-06, + "loss": 0.1753, + "step": 2684 + }, + { + "epoch": 8.468922108575924, + "grad_norm": 0.23660956898638807, + "learning_rate": 6.166247915858612e-06, + "loss": 0.1641, + "step": 2685 + }, + { + "epoch": 8.472069236821401, + "grad_norm": 0.23359602867885043, + "learning_rate": 6.157506848738281e-06, + "loss": 0.1663, + "step": 2686 + }, + { + "epoch": 8.475216365066876, + "grad_norm": 0.22916515054110842, + "learning_rate": 6.148782328474779e-06, + "loss": 0.1681, + "step": 2687 + }, + { + "epoch": 8.478363493312353, + "grad_norm": 0.2341655050876696, + "learning_rate": 6.1400743641805295e-06, + "loss": 0.1637, + "step": 2688 + }, + { + "epoch": 8.48151062155783, + "grad_norm": 0.2326657826844374, + "learning_rate": 6.131382964950646e-06, + "loss": 0.1714, + "step": 2689 + }, + { + "epoch": 8.484657749803304, + "grad_norm": 0.22139519017417883, + "learning_rate": 6.122708139862964e-06, + "loss": 0.1644, + "step": 2690 + }, + { + "epoch": 8.487804878048781, + "grad_norm": 0.2358515865230787, + "learning_rate": 6.114049897977987e-06, + "loss": 0.1678, + "step": 2691 + }, + { + "epoch": 8.490952006294256, + "grad_norm": 0.24072618363900117, + "learning_rate": 6.105408248338907e-06, + "loss": 0.1652, + "step": 2692 + }, + { + "epoch": 8.494099134539733, + "grad_norm": 0.23833730230772185, + "learning_rate": 6.0967831999715895e-06, + "loss": 0.1653, + "step": 2693 + }, + { + "epoch": 8.49724626278521, + "grad_norm": 0.23569737870390947, + "learning_rate": 6.088174761884547e-06, + "loss": 0.1676, + "step": 2694 + }, + { + "epoch": 8.500393391030684, + "grad_norm": 0.2362096527557548, + "learning_rate": 6.079582943068963e-06, + "loss": 0.1613, + "step": 2695 + }, + { + "epoch": 8.50354051927616, + "grad_norm": 0.2276250499406588, + "learning_rate": 6.07100775249864e-06, + "loss": 0.1679, + "step": 2696 + }, + { + "epoch": 8.506687647521636, + "grad_norm": 0.23255051930242077, + "learning_rate": 6.062449199130038e-06, + "loss": 0.158, + "step": 2697 + }, + { + "epoch": 8.509834775767112, + "grad_norm": 0.23823745767581703, + "learning_rate": 6.053907291902215e-06, + "loss": 0.1675, + "step": 2698 + }, + { + "epoch": 8.51298190401259, + "grad_norm": 0.22780299459891643, + "learning_rate": 6.04538203973686e-06, + "loss": 0.167, + "step": 2699 + }, + { + "epoch": 8.516129032258064, + "grad_norm": 0.23195173113273695, + "learning_rate": 6.036873451538268e-06, + "loss": 0.1604, + "step": 2700 + }, + { + "epoch": 8.51927616050354, + "grad_norm": 0.22611245875521443, + "learning_rate": 6.02838153619331e-06, + "loss": 0.1672, + "step": 2701 + }, + { + "epoch": 8.522423288749017, + "grad_norm": 0.22775959921131073, + "learning_rate": 6.019906302571467e-06, + "loss": 0.1641, + "step": 2702 + }, + { + "epoch": 8.525570416994492, + "grad_norm": 0.23041588221663856, + "learning_rate": 6.011447759524776e-06, + "loss": 0.1683, + "step": 2703 + }, + { + "epoch": 8.528717545239969, + "grad_norm": 0.22838434499944277, + "learning_rate": 6.003005915887853e-06, + "loss": 0.1637, + "step": 2704 + }, + { + "epoch": 8.531864673485444, + "grad_norm": 0.23218601962132057, + "learning_rate": 5.99458078047787e-06, + "loss": 0.1701, + "step": 2705 + }, + { + "epoch": 8.53501180173092, + "grad_norm": 0.2416174473030719, + "learning_rate": 5.986172362094551e-06, + "loss": 0.1653, + "step": 2706 + }, + { + "epoch": 8.538158929976397, + "grad_norm": 0.232552051298067, + "learning_rate": 5.977780669520149e-06, + "loss": 0.1673, + "step": 2707 + }, + { + "epoch": 8.541306058221872, + "grad_norm": 0.2345354384269151, + "learning_rate": 5.96940571151946e-06, + "loss": 0.1597, + "step": 2708 + }, + { + "epoch": 8.544453186467349, + "grad_norm": 0.2352724122845352, + "learning_rate": 5.961047496839797e-06, + "loss": 0.17, + "step": 2709 + }, + { + "epoch": 8.547600314712824, + "grad_norm": 0.2347397790865505, + "learning_rate": 5.952706034210978e-06, + "loss": 0.1654, + "step": 2710 + }, + { + "epoch": 8.5507474429583, + "grad_norm": 0.23099011233464176, + "learning_rate": 5.944381332345337e-06, + "loss": 0.1693, + "step": 2711 + }, + { + "epoch": 8.553894571203777, + "grad_norm": 0.23436885807183397, + "learning_rate": 5.93607339993769e-06, + "loss": 0.1625, + "step": 2712 + }, + { + "epoch": 8.557041699449252, + "grad_norm": 0.23460704187792045, + "learning_rate": 5.92778224566535e-06, + "loss": 0.1661, + "step": 2713 + }, + { + "epoch": 8.560188827694729, + "grad_norm": 0.23592554127140355, + "learning_rate": 5.919507878188092e-06, + "loss": 0.1681, + "step": 2714 + }, + { + "epoch": 8.563335955940204, + "grad_norm": 0.22436344128981697, + "learning_rate": 5.9112503061481685e-06, + "loss": 0.1681, + "step": 2715 + }, + { + "epoch": 8.56648308418568, + "grad_norm": 0.23478075715615096, + "learning_rate": 5.903009538170289e-06, + "loss": 0.1697, + "step": 2716 + }, + { + "epoch": 8.569630212431157, + "grad_norm": 0.23791774157827272, + "learning_rate": 5.894785582861606e-06, + "loss": 0.1679, + "step": 2717 + }, + { + "epoch": 8.572777340676632, + "grad_norm": 0.2411394751490439, + "learning_rate": 5.886578448811714e-06, + "loss": 0.167, + "step": 2718 + }, + { + "epoch": 8.575924468922109, + "grad_norm": 0.22846188636305548, + "learning_rate": 5.878388144592642e-06, + "loss": 0.1691, + "step": 2719 + }, + { + "epoch": 8.579071597167584, + "grad_norm": 0.2385881186153001, + "learning_rate": 5.8702146787588435e-06, + "loss": 0.1655, + "step": 2720 + }, + { + "epoch": 8.58221872541306, + "grad_norm": 0.24030991985714578, + "learning_rate": 5.862058059847169e-06, + "loss": 0.1724, + "step": 2721 + }, + { + "epoch": 8.585365853658537, + "grad_norm": 0.22562331859076523, + "learning_rate": 5.8539182963768935e-06, + "loss": 0.1673, + "step": 2722 + }, + { + "epoch": 8.588512981904012, + "grad_norm": 0.22799397594939672, + "learning_rate": 5.845795396849671e-06, + "loss": 0.1625, + "step": 2723 + }, + { + "epoch": 8.591660110149489, + "grad_norm": 0.22658460250777282, + "learning_rate": 5.837689369749554e-06, + "loss": 0.1672, + "step": 2724 + }, + { + "epoch": 8.594807238394965, + "grad_norm": 0.23287534822002212, + "learning_rate": 5.829600223542965e-06, + "loss": 0.167, + "step": 2725 + }, + { + "epoch": 8.59795436664044, + "grad_norm": 0.23513451530619064, + "learning_rate": 5.821527966678693e-06, + "loss": 0.1604, + "step": 2726 + }, + { + "epoch": 8.601101494885917, + "grad_norm": 0.24331885939351217, + "learning_rate": 5.8134726075878965e-06, + "loss": 0.1669, + "step": 2727 + }, + { + "epoch": 8.604248623131392, + "grad_norm": 0.23480791746582516, + "learning_rate": 5.805434154684075e-06, + "loss": 0.1631, + "step": 2728 + }, + { + "epoch": 8.607395751376869, + "grad_norm": 0.2313813224235784, + "learning_rate": 5.797412616363077e-06, + "loss": 0.1718, + "step": 2729 + }, + { + "epoch": 8.610542879622345, + "grad_norm": 0.23469324508654915, + "learning_rate": 5.789408001003079e-06, + "loss": 0.1645, + "step": 2730 + }, + { + "epoch": 8.61369000786782, + "grad_norm": 0.22766690046481194, + "learning_rate": 5.781420316964586e-06, + "loss": 0.1641, + "step": 2731 + }, + { + "epoch": 8.616837136113297, + "grad_norm": 0.22244068111472892, + "learning_rate": 5.773449572590417e-06, + "loss": 0.1677, + "step": 2732 + }, + { + "epoch": 8.619984264358772, + "grad_norm": 0.22626928201892044, + "learning_rate": 5.7654957762056994e-06, + "loss": 0.1658, + "step": 2733 + }, + { + "epoch": 8.623131392604249, + "grad_norm": 0.23558057122663417, + "learning_rate": 5.7575589361178645e-06, + "loss": 0.1623, + "step": 2734 + }, + { + "epoch": 8.626278520849725, + "grad_norm": 0.22952982394885552, + "learning_rate": 5.749639060616618e-06, + "loss": 0.1654, + "step": 2735 + }, + { + "epoch": 8.6294256490952, + "grad_norm": 0.2260057004402793, + "learning_rate": 5.74173615797396e-06, + "loss": 0.1611, + "step": 2736 + }, + { + "epoch": 8.632572777340677, + "grad_norm": 0.22660148535255212, + "learning_rate": 5.733850236444161e-06, + "loss": 0.1654, + "step": 2737 + }, + { + "epoch": 8.635719905586154, + "grad_norm": 0.2275035237099505, + "learning_rate": 5.725981304263756e-06, + "loss": 0.1704, + "step": 2738 + }, + { + "epoch": 8.638867033831628, + "grad_norm": 0.23670395896446667, + "learning_rate": 5.718129369651524e-06, + "loss": 0.1683, + "step": 2739 + }, + { + "epoch": 8.642014162077105, + "grad_norm": 0.2373653142814522, + "learning_rate": 5.710294440808507e-06, + "loss": 0.1721, + "step": 2740 + }, + { + "epoch": 8.64516129032258, + "grad_norm": 0.23104346401189593, + "learning_rate": 5.702476525917979e-06, + "loss": 0.1663, + "step": 2741 + }, + { + "epoch": 8.648308418568057, + "grad_norm": 0.23283488730211302, + "learning_rate": 5.6946756331454354e-06, + "loss": 0.1668, + "step": 2742 + }, + { + "epoch": 8.651455546813533, + "grad_norm": 0.2311838977095239, + "learning_rate": 5.6868917706386105e-06, + "loss": 0.1747, + "step": 2743 + }, + { + "epoch": 8.654602675059008, + "grad_norm": 0.2241606956720152, + "learning_rate": 5.67912494652743e-06, + "loss": 0.1641, + "step": 2744 + }, + { + "epoch": 8.657749803304485, + "grad_norm": 0.23193312446144088, + "learning_rate": 5.671375168924041e-06, + "loss": 0.1696, + "step": 2745 + }, + { + "epoch": 8.66089693154996, + "grad_norm": 0.23447145957791274, + "learning_rate": 5.663642445922777e-06, + "loss": 0.1699, + "step": 2746 + }, + { + "epoch": 8.664044059795437, + "grad_norm": 0.22987054795106973, + "learning_rate": 5.655926785600158e-06, + "loss": 0.1612, + "step": 2747 + }, + { + "epoch": 8.667191188040913, + "grad_norm": 0.2352484784071021, + "learning_rate": 5.648228196014888e-06, + "loss": 0.1674, + "step": 2748 + }, + { + "epoch": 8.670338316286388, + "grad_norm": 0.23150853204839367, + "learning_rate": 5.640546685207842e-06, + "loss": 0.1677, + "step": 2749 + }, + { + "epoch": 8.673485444531865, + "grad_norm": 0.2319763405157472, + "learning_rate": 5.632882261202054e-06, + "loss": 0.1627, + "step": 2750 + }, + { + "epoch": 8.67663257277734, + "grad_norm": 0.23040198433732714, + "learning_rate": 5.625234932002706e-06, + "loss": 0.1641, + "step": 2751 + }, + { + "epoch": 8.679779701022817, + "grad_norm": 0.24040413406804584, + "learning_rate": 5.617604705597136e-06, + "loss": 0.166, + "step": 2752 + }, + { + "epoch": 8.682926829268293, + "grad_norm": 0.23110146532134304, + "learning_rate": 5.609991589954809e-06, + "loss": 0.1683, + "step": 2753 + }, + { + "epoch": 8.686073957513768, + "grad_norm": 0.22419932903394638, + "learning_rate": 5.602395593027327e-06, + "loss": 0.1716, + "step": 2754 + }, + { + "epoch": 8.689221085759245, + "grad_norm": 0.22482368034008485, + "learning_rate": 5.594816722748403e-06, + "loss": 0.1612, + "step": 2755 + }, + { + "epoch": 8.69236821400472, + "grad_norm": 0.22947954480527974, + "learning_rate": 5.58725498703387e-06, + "loss": 0.1703, + "step": 2756 + }, + { + "epoch": 8.695515342250197, + "grad_norm": 0.234698861914943, + "learning_rate": 5.579710393781666e-06, + "loss": 0.168, + "step": 2757 + }, + { + "epoch": 8.698662470495673, + "grad_norm": 0.24098945022071477, + "learning_rate": 5.5721829508718095e-06, + "loss": 0.1665, + "step": 2758 + }, + { + "epoch": 8.701809598741148, + "grad_norm": 0.2371662907560327, + "learning_rate": 5.564672666166425e-06, + "loss": 0.1667, + "step": 2759 + }, + { + "epoch": 8.704956726986625, + "grad_norm": 0.23429888837701635, + "learning_rate": 5.557179547509703e-06, + "loss": 0.1718, + "step": 2760 + }, + { + "epoch": 8.708103855232102, + "grad_norm": 0.23129876881925154, + "learning_rate": 5.549703602727912e-06, + "loss": 0.1746, + "step": 2761 + }, + { + "epoch": 8.711250983477576, + "grad_norm": 0.23080453324233027, + "learning_rate": 5.542244839629379e-06, + "loss": 0.1654, + "step": 2762 + }, + { + "epoch": 8.714398111723053, + "grad_norm": 0.23544727546366193, + "learning_rate": 5.534803266004491e-06, + "loss": 0.1698, + "step": 2763 + }, + { + "epoch": 8.717545239968528, + "grad_norm": 0.23363093531020357, + "learning_rate": 5.527378889625668e-06, + "loss": 0.1647, + "step": 2764 + }, + { + "epoch": 8.720692368214005, + "grad_norm": 0.23435894467290583, + "learning_rate": 5.519971718247384e-06, + "loss": 0.163, + "step": 2765 + }, + { + "epoch": 8.723839496459481, + "grad_norm": 0.23005445022151622, + "learning_rate": 5.512581759606137e-06, + "loss": 0.1648, + "step": 2766 + }, + { + "epoch": 8.726986624704956, + "grad_norm": 0.22957271730336687, + "learning_rate": 5.50520902142044e-06, + "loss": 0.1666, + "step": 2767 + }, + { + "epoch": 8.730133752950433, + "grad_norm": 0.22954716917821802, + "learning_rate": 5.497853511390836e-06, + "loss": 0.1688, + "step": 2768 + }, + { + "epoch": 8.733280881195908, + "grad_norm": 0.22392571792472526, + "learning_rate": 5.490515237199852e-06, + "loss": 0.17, + "step": 2769 + }, + { + "epoch": 8.736428009441385, + "grad_norm": 0.2355921505287895, + "learning_rate": 5.483194206512034e-06, + "loss": 0.1662, + "step": 2770 + }, + { + "epoch": 8.739575137686861, + "grad_norm": 0.23974943062847195, + "learning_rate": 5.475890426973903e-06, + "loss": 0.1694, + "step": 2771 + }, + { + "epoch": 8.742722265932336, + "grad_norm": 0.2338669519112205, + "learning_rate": 5.46860390621397e-06, + "loss": 0.1659, + "step": 2772 + }, + { + "epoch": 8.745869394177813, + "grad_norm": 0.23127783986823608, + "learning_rate": 5.461334651842721e-06, + "loss": 0.1664, + "step": 2773 + }, + { + "epoch": 8.74901652242329, + "grad_norm": 0.23190110954348572, + "learning_rate": 5.454082671452597e-06, + "loss": 0.1676, + "step": 2774 + }, + { + "epoch": 8.752163650668765, + "grad_norm": 0.23152867010137812, + "learning_rate": 5.446847972618009e-06, + "loss": 0.1635, + "step": 2775 + }, + { + "epoch": 8.755310778914241, + "grad_norm": 0.23464418647093355, + "learning_rate": 5.439630562895311e-06, + "loss": 0.1601, + "step": 2776 + }, + { + "epoch": 8.758457907159716, + "grad_norm": 0.22904054716255184, + "learning_rate": 5.43243044982281e-06, + "loss": 0.1658, + "step": 2777 + }, + { + "epoch": 8.761605035405193, + "grad_norm": 0.22867471399286618, + "learning_rate": 5.425247640920726e-06, + "loss": 0.1677, + "step": 2778 + }, + { + "epoch": 8.76475216365067, + "grad_norm": 0.22791012026392768, + "learning_rate": 5.418082143691229e-06, + "loss": 0.1732, + "step": 2779 + }, + { + "epoch": 8.767899291896144, + "grad_norm": 0.21856098390948892, + "learning_rate": 5.410933965618389e-06, + "loss": 0.1648, + "step": 2780 + }, + { + "epoch": 8.771046420141621, + "grad_norm": 0.22405701257784277, + "learning_rate": 5.4038031141682e-06, + "loss": 0.1597, + "step": 2781 + }, + { + "epoch": 8.774193548387096, + "grad_norm": 0.23307681739252337, + "learning_rate": 5.396689596788556e-06, + "loss": 0.1675, + "step": 2782 + }, + { + "epoch": 8.777340676632573, + "grad_norm": 0.23277974511466729, + "learning_rate": 5.389593420909237e-06, + "loss": 0.1657, + "step": 2783 + }, + { + "epoch": 8.78048780487805, + "grad_norm": 0.2249371763607525, + "learning_rate": 5.382514593941926e-06, + "loss": 0.1667, + "step": 2784 + }, + { + "epoch": 8.783634933123524, + "grad_norm": 0.23060309789801092, + "learning_rate": 5.375453123280171e-06, + "loss": 0.1567, + "step": 2785 + }, + { + "epoch": 8.786782061369001, + "grad_norm": 0.22431101983410784, + "learning_rate": 5.368409016299404e-06, + "loss": 0.1646, + "step": 2786 + }, + { + "epoch": 8.789929189614476, + "grad_norm": 0.23707669405318257, + "learning_rate": 5.36138228035691e-06, + "loss": 0.1686, + "step": 2787 + }, + { + "epoch": 8.793076317859953, + "grad_norm": 0.22826592786983121, + "learning_rate": 5.3543729227918375e-06, + "loss": 0.167, + "step": 2788 + }, + { + "epoch": 8.79622344610543, + "grad_norm": 0.22647434359880703, + "learning_rate": 5.34738095092519e-06, + "loss": 0.1671, + "step": 2789 + }, + { + "epoch": 8.799370574350904, + "grad_norm": 0.23277753509076599, + "learning_rate": 5.340406372059793e-06, + "loss": 0.1694, + "step": 2790 + }, + { + "epoch": 8.802517702596381, + "grad_norm": 0.22966481306572614, + "learning_rate": 5.33344919348033e-06, + "loss": 0.1677, + "step": 2791 + }, + { + "epoch": 8.805664830841856, + "grad_norm": 0.2345330156945199, + "learning_rate": 5.3265094224532925e-06, + "loss": 0.1662, + "step": 2792 + }, + { + "epoch": 8.808811959087333, + "grad_norm": 0.24220581762981963, + "learning_rate": 5.319587066227e-06, + "loss": 0.1616, + "step": 2793 + }, + { + "epoch": 8.81195908733281, + "grad_norm": 0.23003977776252021, + "learning_rate": 5.312682132031575e-06, + "loss": 0.1692, + "step": 2794 + }, + { + "epoch": 8.815106215578284, + "grad_norm": 0.22663651876262159, + "learning_rate": 5.3057946270789504e-06, + "loss": 0.171, + "step": 2795 + }, + { + "epoch": 8.818253343823761, + "grad_norm": 0.23957359702108452, + "learning_rate": 5.298924558562852e-06, + "loss": 0.1653, + "step": 2796 + }, + { + "epoch": 8.821400472069238, + "grad_norm": 0.229995530933423, + "learning_rate": 5.292071933658794e-06, + "loss": 0.1608, + "step": 2797 + }, + { + "epoch": 8.824547600314713, + "grad_norm": 0.23340637605795178, + "learning_rate": 5.2852367595240735e-06, + "loss": 0.1727, + "step": 2798 + }, + { + "epoch": 8.82769472856019, + "grad_norm": 0.22650780265142523, + "learning_rate": 5.278419043297756e-06, + "loss": 0.1727, + "step": 2799 + }, + { + "epoch": 8.830841856805664, + "grad_norm": 0.23469188183263542, + "learning_rate": 5.271618792100679e-06, + "loss": 0.1595, + "step": 2800 + }, + { + "epoch": 8.83398898505114, + "grad_norm": 0.22902689381057262, + "learning_rate": 5.264836013035435e-06, + "loss": 0.1666, + "step": 2801 + }, + { + "epoch": 8.837136113296618, + "grad_norm": 0.24601196499502342, + "learning_rate": 5.25807071318637e-06, + "loss": 0.1675, + "step": 2802 + }, + { + "epoch": 8.840283241542092, + "grad_norm": 0.23570422675260183, + "learning_rate": 5.251322899619565e-06, + "loss": 0.1632, + "step": 2803 + }, + { + "epoch": 8.84343036978757, + "grad_norm": 0.23861546188691643, + "learning_rate": 5.2445925793828504e-06, + "loss": 0.1711, + "step": 2804 + }, + { + "epoch": 8.846577498033044, + "grad_norm": 0.22082821092360364, + "learning_rate": 5.237879759505778e-06, + "loss": 0.1712, + "step": 2805 + }, + { + "epoch": 8.84972462627852, + "grad_norm": 0.23120537316015438, + "learning_rate": 5.2311844469996205e-06, + "loss": 0.1664, + "step": 2806 + }, + { + "epoch": 8.852871754523997, + "grad_norm": 0.23260665872741915, + "learning_rate": 5.224506648857374e-06, + "loss": 0.1557, + "step": 2807 + }, + { + "epoch": 8.856018882769472, + "grad_norm": 0.2310320953143728, + "learning_rate": 5.217846372053722e-06, + "loss": 0.1701, + "step": 2808 + }, + { + "epoch": 8.859166011014949, + "grad_norm": 0.23800631732996927, + "learning_rate": 5.211203623545071e-06, + "loss": 0.166, + "step": 2809 + }, + { + "epoch": 8.862313139260426, + "grad_norm": 0.2230982585460558, + "learning_rate": 5.204578410269503e-06, + "loss": 0.1748, + "step": 2810 + }, + { + "epoch": 8.8654602675059, + "grad_norm": 0.23177916380091554, + "learning_rate": 5.197970739146792e-06, + "loss": 0.1667, + "step": 2811 + }, + { + "epoch": 8.868607395751377, + "grad_norm": 0.23008230747581662, + "learning_rate": 5.191380617078389e-06, + "loss": 0.1702, + "step": 2812 + }, + { + "epoch": 8.871754523996852, + "grad_norm": 0.22716372034675994, + "learning_rate": 5.184808050947413e-06, + "loss": 0.1627, + "step": 2813 + }, + { + "epoch": 8.874901652242329, + "grad_norm": 0.23335158310105836, + "learning_rate": 5.178253047618657e-06, + "loss": 0.1723, + "step": 2814 + }, + { + "epoch": 8.878048780487806, + "grad_norm": 0.22629652030237907, + "learning_rate": 5.171715613938553e-06, + "loss": 0.1665, + "step": 2815 + }, + { + "epoch": 8.88119590873328, + "grad_norm": 0.23433706877773142, + "learning_rate": 5.165195756735199e-06, + "loss": 0.1742, + "step": 2816 + }, + { + "epoch": 8.884343036978757, + "grad_norm": 0.2410492157482833, + "learning_rate": 5.158693482818321e-06, + "loss": 0.173, + "step": 2817 + }, + { + "epoch": 8.887490165224232, + "grad_norm": 0.2352594527132892, + "learning_rate": 5.152208798979295e-06, + "loss": 0.1581, + "step": 2818 + }, + { + "epoch": 8.890637293469709, + "grad_norm": 0.23014025030306132, + "learning_rate": 5.145741711991104e-06, + "loss": 0.1674, + "step": 2819 + }, + { + "epoch": 8.893784421715186, + "grad_norm": 0.23167065606455844, + "learning_rate": 5.139292228608378e-06, + "loss": 0.1646, + "step": 2820 + }, + { + "epoch": 8.89693154996066, + "grad_norm": 0.2399892401864134, + "learning_rate": 5.1328603555673375e-06, + "loss": 0.1614, + "step": 2821 + }, + { + "epoch": 8.900078678206137, + "grad_norm": 0.24240253361943384, + "learning_rate": 5.126446099585824e-06, + "loss": 0.1671, + "step": 2822 + }, + { + "epoch": 8.903225806451612, + "grad_norm": 0.23174387029737067, + "learning_rate": 5.120049467363275e-06, + "loss": 0.1625, + "step": 2823 + }, + { + "epoch": 8.906372934697089, + "grad_norm": 0.23155008484418904, + "learning_rate": 5.1136704655807145e-06, + "loss": 0.1689, + "step": 2824 + }, + { + "epoch": 8.909520062942565, + "grad_norm": 0.2340628919114927, + "learning_rate": 5.107309100900762e-06, + "loss": 0.1623, + "step": 2825 + }, + { + "epoch": 8.91266719118804, + "grad_norm": 0.22856119137017192, + "learning_rate": 5.100965379967606e-06, + "loss": 0.1634, + "step": 2826 + }, + { + "epoch": 8.915814319433517, + "grad_norm": 0.238656910481426, + "learning_rate": 5.094639309407021e-06, + "loss": 0.1654, + "step": 2827 + }, + { + "epoch": 8.918961447678992, + "grad_norm": 0.23793602636902197, + "learning_rate": 5.0883308958263255e-06, + "loss": 0.1695, + "step": 2828 + }, + { + "epoch": 8.922108575924469, + "grad_norm": 0.23552997928550543, + "learning_rate": 5.082040145814413e-06, + "loss": 0.1634, + "step": 2829 + }, + { + "epoch": 8.925255704169945, + "grad_norm": 0.2345591111424614, + "learning_rate": 5.075767065941728e-06, + "loss": 0.1712, + "step": 2830 + }, + { + "epoch": 8.92840283241542, + "grad_norm": 0.23815624814766417, + "learning_rate": 5.069511662760245e-06, + "loss": 0.1653, + "step": 2831 + }, + { + "epoch": 8.931549960660897, + "grad_norm": 0.2339825850851606, + "learning_rate": 5.063273942803491e-06, + "loss": 0.1713, + "step": 2832 + }, + { + "epoch": 8.934697088906374, + "grad_norm": 0.23772485751502045, + "learning_rate": 5.057053912586512e-06, + "loss": 0.1616, + "step": 2833 + }, + { + "epoch": 8.937844217151849, + "grad_norm": 0.23732466399568317, + "learning_rate": 5.050851578605892e-06, + "loss": 0.1699, + "step": 2834 + }, + { + "epoch": 8.940991345397325, + "grad_norm": 0.23247387261686783, + "learning_rate": 5.044666947339716e-06, + "loss": 0.1677, + "step": 2835 + }, + { + "epoch": 8.9441384736428, + "grad_norm": 0.22590208575695253, + "learning_rate": 5.038500025247589e-06, + "loss": 0.1676, + "step": 2836 + }, + { + "epoch": 8.947285601888277, + "grad_norm": 0.23597275765287926, + "learning_rate": 5.032350818770616e-06, + "loss": 0.1677, + "step": 2837 + }, + { + "epoch": 8.950432730133754, + "grad_norm": 0.23033651705619715, + "learning_rate": 5.0262193343314e-06, + "loss": 0.1686, + "step": 2838 + }, + { + "epoch": 8.953579858379229, + "grad_norm": 0.2298946051218498, + "learning_rate": 5.020105578334038e-06, + "loss": 0.1693, + "step": 2839 + }, + { + "epoch": 8.956726986624705, + "grad_norm": 0.22515833749826317, + "learning_rate": 5.014009557164099e-06, + "loss": 0.1658, + "step": 2840 + }, + { + "epoch": 8.95987411487018, + "grad_norm": 0.22997911948352323, + "learning_rate": 5.0079312771886425e-06, + "loss": 0.1709, + "step": 2841 + }, + { + "epoch": 8.963021243115657, + "grad_norm": 0.23065097782750338, + "learning_rate": 5.001870744756182e-06, + "loss": 0.1645, + "step": 2842 + }, + { + "epoch": 8.966168371361134, + "grad_norm": 0.23243456881790153, + "learning_rate": 4.995827966196714e-06, + "loss": 0.1715, + "step": 2843 + }, + { + "epoch": 8.969315499606608, + "grad_norm": 0.22702087926588302, + "learning_rate": 4.9898029478216735e-06, + "loss": 0.1656, + "step": 2844 + }, + { + "epoch": 8.972462627852085, + "grad_norm": 0.23667845742674357, + "learning_rate": 4.983795695923958e-06, + "loss": 0.1665, + "step": 2845 + }, + { + "epoch": 8.975609756097562, + "grad_norm": 0.23376278457824037, + "learning_rate": 4.977806216777904e-06, + "loss": 0.1649, + "step": 2846 + }, + { + "epoch": 8.978756884343037, + "grad_norm": 0.22714146711985553, + "learning_rate": 4.971834516639281e-06, + "loss": 0.17, + "step": 2847 + }, + { + "epoch": 8.981904012588513, + "grad_norm": 0.23000407639648165, + "learning_rate": 4.965880601745301e-06, + "loss": 0.1658, + "step": 2848 + }, + { + "epoch": 8.985051140833988, + "grad_norm": 0.23490571123631973, + "learning_rate": 4.959944478314586e-06, + "loss": 0.1637, + "step": 2849 + }, + { + "epoch": 8.988198269079465, + "grad_norm": 0.2357626917407318, + "learning_rate": 4.954026152547187e-06, + "loss": 0.1643, + "step": 2850 + }, + { + "epoch": 8.991345397324942, + "grad_norm": 0.22454926371991393, + "learning_rate": 4.948125630624556e-06, + "loss": 0.1712, + "step": 2851 + }, + { + "epoch": 8.994492525570417, + "grad_norm": 0.22383263709497628, + "learning_rate": 4.9422429187095586e-06, + "loss": 0.1707, + "step": 2852 + }, + { + "epoch": 8.997639653815893, + "grad_norm": 0.23402521859638922, + "learning_rate": 4.936378022946449e-06, + "loss": 0.1627, + "step": 2853 + }, + { + "epoch": 9.003147128245477, + "grad_norm": 0.7259930729062384, + "learning_rate": 4.930530949460883e-06, + "loss": 0.3053, + "step": 2854 + }, + { + "epoch": 9.006294256490952, + "grad_norm": 0.2562706446384713, + "learning_rate": 4.924701704359899e-06, + "loss": 0.1322, + "step": 2855 + }, + { + "epoch": 9.009441384736428, + "grad_norm": 0.2854052200203414, + "learning_rate": 4.918890293731908e-06, + "loss": 0.1333, + "step": 2856 + }, + { + "epoch": 9.012588512981903, + "grad_norm": 0.4691429203587427, + "learning_rate": 4.9130967236467026e-06, + "loss": 0.1374, + "step": 2857 + }, + { + "epoch": 9.01573564122738, + "grad_norm": 0.4028184182051083, + "learning_rate": 4.907321000155432e-06, + "loss": 0.1364, + "step": 2858 + }, + { + "epoch": 9.018882769472857, + "grad_norm": 0.27399671132513376, + "learning_rate": 4.901563129290619e-06, + "loss": 0.137, + "step": 2859 + }, + { + "epoch": 9.022029897718332, + "grad_norm": 0.3360542252051378, + "learning_rate": 4.895823117066122e-06, + "loss": 0.1385, + "step": 2860 + }, + { + "epoch": 9.025177025963808, + "grad_norm": 0.3362833473207967, + "learning_rate": 4.890100969477159e-06, + "loss": 0.1308, + "step": 2861 + }, + { + "epoch": 9.028324154209285, + "grad_norm": 0.280838008338128, + "learning_rate": 4.884396692500293e-06, + "loss": 0.1274, + "step": 2862 + }, + { + "epoch": 9.03147128245476, + "grad_norm": 0.27276504949498936, + "learning_rate": 4.878710292093409e-06, + "loss": 0.1293, + "step": 2863 + }, + { + "epoch": 9.034618410700237, + "grad_norm": 0.30411776357879683, + "learning_rate": 4.8730417741957306e-06, + "loss": 0.1315, + "step": 2864 + }, + { + "epoch": 9.037765538945711, + "grad_norm": 0.3071835435610242, + "learning_rate": 4.867391144727798e-06, + "loss": 0.1292, + "step": 2865 + }, + { + "epoch": 9.040912667191188, + "grad_norm": 0.2774412987717509, + "learning_rate": 4.861758409591474e-06, + "loss": 0.1352, + "step": 2866 + }, + { + "epoch": 9.044059795436665, + "grad_norm": 0.24821320889899118, + "learning_rate": 4.8561435746699224e-06, + "loss": 0.132, + "step": 2867 + }, + { + "epoch": 9.04720692368214, + "grad_norm": 0.260501036335888, + "learning_rate": 4.85054664582762e-06, + "loss": 0.1298, + "step": 2868 + }, + { + "epoch": 9.050354051927616, + "grad_norm": 0.25950354132313963, + "learning_rate": 4.844967628910332e-06, + "loss": 0.1382, + "step": 2869 + }, + { + "epoch": 9.053501180173091, + "grad_norm": 0.254911463794064, + "learning_rate": 4.839406529745122e-06, + "loss": 0.1331, + "step": 2870 + }, + { + "epoch": 9.056648308418568, + "grad_norm": 0.2568698001440897, + "learning_rate": 4.833863354140345e-06, + "loss": 0.1313, + "step": 2871 + }, + { + "epoch": 9.059795436664045, + "grad_norm": 0.25834630400276143, + "learning_rate": 4.828338107885621e-06, + "loss": 0.1339, + "step": 2872 + }, + { + "epoch": 9.06294256490952, + "grad_norm": 0.2626115567381949, + "learning_rate": 4.822830796751856e-06, + "loss": 0.1398, + "step": 2873 + }, + { + "epoch": 9.066089693154996, + "grad_norm": 0.23847349899918344, + "learning_rate": 4.817341426491213e-06, + "loss": 0.1304, + "step": 2874 + }, + { + "epoch": 9.069236821400471, + "grad_norm": 0.25139369715829973, + "learning_rate": 4.811870002837126e-06, + "loss": 0.1309, + "step": 2875 + }, + { + "epoch": 9.072383949645948, + "grad_norm": 0.2453860537308011, + "learning_rate": 4.806416531504274e-06, + "loss": 0.135, + "step": 2876 + }, + { + "epoch": 9.075531077891425, + "grad_norm": 0.25404710592240104, + "learning_rate": 4.800981018188602e-06, + "loss": 0.1266, + "step": 2877 + }, + { + "epoch": 9.0786782061369, + "grad_norm": 0.22754008276241747, + "learning_rate": 4.79556346856728e-06, + "loss": 0.1357, + "step": 2878 + }, + { + "epoch": 9.081825334382376, + "grad_norm": 0.25168887931836287, + "learning_rate": 4.79016388829873e-06, + "loss": 0.1341, + "step": 2879 + }, + { + "epoch": 9.084972462627853, + "grad_norm": 0.2502383435030883, + "learning_rate": 4.784782283022597e-06, + "loss": 0.1352, + "step": 2880 + }, + { + "epoch": 9.088119590873328, + "grad_norm": 0.25986754177545734, + "learning_rate": 4.7794186583597544e-06, + "loss": 0.132, + "step": 2881 + }, + { + "epoch": 9.091266719118805, + "grad_norm": 0.22662605665615443, + "learning_rate": 4.774073019912298e-06, + "loss": 0.1366, + "step": 2882 + }, + { + "epoch": 9.09441384736428, + "grad_norm": 0.2304079612052606, + "learning_rate": 4.7687453732635305e-06, + "loss": 0.131, + "step": 2883 + }, + { + "epoch": 9.097560975609756, + "grad_norm": 0.23596467115393177, + "learning_rate": 4.763435723977974e-06, + "loss": 0.1311, + "step": 2884 + }, + { + "epoch": 9.100708103855233, + "grad_norm": 0.23983207466582687, + "learning_rate": 4.7581440776013425e-06, + "loss": 0.1295, + "step": 2885 + }, + { + "epoch": 9.103855232100708, + "grad_norm": 0.22975847317016954, + "learning_rate": 4.752870439660551e-06, + "loss": 0.1321, + "step": 2886 + }, + { + "epoch": 9.107002360346184, + "grad_norm": 0.23955053071281487, + "learning_rate": 4.747614815663711e-06, + "loss": 0.1355, + "step": 2887 + }, + { + "epoch": 9.11014948859166, + "grad_norm": 0.2342749597252554, + "learning_rate": 4.742377211100105e-06, + "loss": 0.1302, + "step": 2888 + }, + { + "epoch": 9.113296616837136, + "grad_norm": 0.24153308674358212, + "learning_rate": 4.7371576314402135e-06, + "loss": 0.1264, + "step": 2889 + }, + { + "epoch": 9.116443745082613, + "grad_norm": 0.2321425269869848, + "learning_rate": 4.731956082135669e-06, + "loss": 0.1268, + "step": 2890 + }, + { + "epoch": 9.119590873328088, + "grad_norm": 0.23212954304734737, + "learning_rate": 4.726772568619297e-06, + "loss": 0.1325, + "step": 2891 + }, + { + "epoch": 9.122738001573564, + "grad_norm": 0.23275541822697182, + "learning_rate": 4.721607096305063e-06, + "loss": 0.1239, + "step": 2892 + }, + { + "epoch": 9.12588512981904, + "grad_norm": 0.23593940968545418, + "learning_rate": 4.716459670588102e-06, + "loss": 0.1332, + "step": 2893 + }, + { + "epoch": 9.129032258064516, + "grad_norm": 0.24159409253703298, + "learning_rate": 4.711330296844695e-06, + "loss": 0.1337, + "step": 2894 + }, + { + "epoch": 9.132179386309993, + "grad_norm": 0.225298246971766, + "learning_rate": 4.706218980432269e-06, + "loss": 0.1332, + "step": 2895 + }, + { + "epoch": 9.135326514555468, + "grad_norm": 0.23345137024396634, + "learning_rate": 4.701125726689394e-06, + "loss": 0.1289, + "step": 2896 + }, + { + "epoch": 9.138473642800944, + "grad_norm": 0.22765284284624834, + "learning_rate": 4.69605054093577e-06, + "loss": 0.1332, + "step": 2897 + }, + { + "epoch": 9.141620771046421, + "grad_norm": 0.2365079564611352, + "learning_rate": 4.690993428472231e-06, + "loss": 0.1353, + "step": 2898 + }, + { + "epoch": 9.144767899291896, + "grad_norm": 0.22894145213619913, + "learning_rate": 4.685954394580723e-06, + "loss": 0.1316, + "step": 2899 + }, + { + "epoch": 9.147915027537373, + "grad_norm": 0.2316844787210736, + "learning_rate": 4.680933444524327e-06, + "loss": 0.1319, + "step": 2900 + }, + { + "epoch": 9.151062155782848, + "grad_norm": 0.2295088666150164, + "learning_rate": 4.675930583547219e-06, + "loss": 0.1352, + "step": 2901 + }, + { + "epoch": 9.154209284028324, + "grad_norm": 0.22737542176781833, + "learning_rate": 4.670945816874691e-06, + "loss": 0.1362, + "step": 2902 + }, + { + "epoch": 9.157356412273801, + "grad_norm": 0.23483198051992082, + "learning_rate": 4.66597914971314e-06, + "loss": 0.123, + "step": 2903 + }, + { + "epoch": 9.160503540519276, + "grad_norm": 0.23493971809693678, + "learning_rate": 4.661030587250045e-06, + "loss": 0.1345, + "step": 2904 + }, + { + "epoch": 9.163650668764753, + "grad_norm": 0.23898547282251156, + "learning_rate": 4.656100134653988e-06, + "loss": 0.1289, + "step": 2905 + }, + { + "epoch": 9.166797797010227, + "grad_norm": 0.22591766013667208, + "learning_rate": 4.65118779707463e-06, + "loss": 0.1365, + "step": 2906 + }, + { + "epoch": 9.169944925255704, + "grad_norm": 0.22651931489444632, + "learning_rate": 4.646293579642716e-06, + "loss": 0.1372, + "step": 2907 + }, + { + "epoch": 9.17309205350118, + "grad_norm": 0.22780380035435008, + "learning_rate": 4.641417487470058e-06, + "loss": 0.135, + "step": 2908 + }, + { + "epoch": 9.176239181746656, + "grad_norm": 0.23276681424595513, + "learning_rate": 4.636559525649546e-06, + "loss": 0.1362, + "step": 2909 + }, + { + "epoch": 9.179386309992132, + "grad_norm": 0.23869234341896103, + "learning_rate": 4.631719699255123e-06, + "loss": 0.1352, + "step": 2910 + }, + { + "epoch": 9.182533438237607, + "grad_norm": 0.23080606084113706, + "learning_rate": 4.626898013341801e-06, + "loss": 0.1347, + "step": 2911 + }, + { + "epoch": 9.185680566483084, + "grad_norm": 0.23836084114678538, + "learning_rate": 4.622094472945639e-06, + "loss": 0.1246, + "step": 2912 + }, + { + "epoch": 9.18882769472856, + "grad_norm": 0.2404240676609406, + "learning_rate": 4.6173090830837434e-06, + "loss": 0.1325, + "step": 2913 + }, + { + "epoch": 9.191974822974036, + "grad_norm": 0.2123536854744844, + "learning_rate": 4.612541848754265e-06, + "loss": 0.1355, + "step": 2914 + }, + { + "epoch": 9.195121951219512, + "grad_norm": 0.23581558553764673, + "learning_rate": 4.60779277493639e-06, + "loss": 0.1314, + "step": 2915 + }, + { + "epoch": 9.198269079464989, + "grad_norm": 0.23688342974144003, + "learning_rate": 4.6030618665903425e-06, + "loss": 0.1317, + "step": 2916 + }, + { + "epoch": 9.201416207710464, + "grad_norm": 0.22422105201679832, + "learning_rate": 4.598349128657362e-06, + "loss": 0.1276, + "step": 2917 + }, + { + "epoch": 9.20456333595594, + "grad_norm": 0.22578943792556588, + "learning_rate": 4.593654566059721e-06, + "loss": 0.1339, + "step": 2918 + }, + { + "epoch": 9.207710464201416, + "grad_norm": 0.24065645526761678, + "learning_rate": 4.588978183700705e-06, + "loss": 0.1265, + "step": 2919 + }, + { + "epoch": 9.210857592446892, + "grad_norm": 0.22356993270115313, + "learning_rate": 4.584319986464608e-06, + "loss": 0.1282, + "step": 2920 + }, + { + "epoch": 9.214004720692369, + "grad_norm": 0.22840256425011418, + "learning_rate": 4.579679979216736e-06, + "loss": 0.1354, + "step": 2921 + }, + { + "epoch": 9.217151848937844, + "grad_norm": 0.22189263858930341, + "learning_rate": 4.575058166803388e-06, + "loss": 0.1292, + "step": 2922 + }, + { + "epoch": 9.22029897718332, + "grad_norm": 0.2373676925550199, + "learning_rate": 4.570454554051869e-06, + "loss": 0.1308, + "step": 2923 + }, + { + "epoch": 9.223446105428796, + "grad_norm": 0.23466096466526548, + "learning_rate": 4.565869145770464e-06, + "loss": 0.1307, + "step": 2924 + }, + { + "epoch": 9.226593233674272, + "grad_norm": 0.22341580256810417, + "learning_rate": 4.561301946748457e-06, + "loss": 0.1356, + "step": 2925 + }, + { + "epoch": 9.229740361919749, + "grad_norm": 0.2337076552757127, + "learning_rate": 4.5567529617561015e-06, + "loss": 0.1351, + "step": 2926 + }, + { + "epoch": 9.232887490165224, + "grad_norm": 0.22744541315055633, + "learning_rate": 4.552222195544636e-06, + "loss": 0.1312, + "step": 2927 + }, + { + "epoch": 9.2360346184107, + "grad_norm": 0.23338172402658214, + "learning_rate": 4.547709652846264e-06, + "loss": 0.1284, + "step": 2928 + }, + { + "epoch": 9.239181746656175, + "grad_norm": 0.23508676122090486, + "learning_rate": 4.543215338374159e-06, + "loss": 0.1317, + "step": 2929 + }, + { + "epoch": 9.242328874901652, + "grad_norm": 0.24601704848722142, + "learning_rate": 4.538739256822453e-06, + "loss": 0.1338, + "step": 2930 + }, + { + "epoch": 9.245476003147129, + "grad_norm": 0.22389912494058156, + "learning_rate": 4.5342814128662376e-06, + "loss": 0.1316, + "step": 2931 + }, + { + "epoch": 9.248623131392604, + "grad_norm": 0.2263022423534222, + "learning_rate": 4.529841811161555e-06, + "loss": 0.1321, + "step": 2932 + }, + { + "epoch": 9.25177025963808, + "grad_norm": 0.23229249077068176, + "learning_rate": 4.5254204563453866e-06, + "loss": 0.1347, + "step": 2933 + }, + { + "epoch": 9.254917387883557, + "grad_norm": 0.2310347838411131, + "learning_rate": 4.521017353035675e-06, + "loss": 0.131, + "step": 2934 + }, + { + "epoch": 9.258064516129032, + "grad_norm": 0.23080904694053458, + "learning_rate": 4.5166325058312745e-06, + "loss": 0.1358, + "step": 2935 + }, + { + "epoch": 9.261211644374509, + "grad_norm": 0.22383054119713772, + "learning_rate": 4.512265919311992e-06, + "loss": 0.1348, + "step": 2936 + }, + { + "epoch": 9.264358772619984, + "grad_norm": 0.23282253286973853, + "learning_rate": 4.5079175980385546e-06, + "loss": 0.1291, + "step": 2937 + }, + { + "epoch": 9.26750590086546, + "grad_norm": 0.22462772007805537, + "learning_rate": 4.503587546552607e-06, + "loss": 0.1326, + "step": 2938 + }, + { + "epoch": 9.270653029110937, + "grad_norm": 0.2404642052007977, + "learning_rate": 4.49927576937672e-06, + "loss": 0.1353, + "step": 2939 + }, + { + "epoch": 9.273800157356412, + "grad_norm": 0.22056179286469124, + "learning_rate": 4.494982271014371e-06, + "loss": 0.1327, + "step": 2940 + }, + { + "epoch": 9.276947285601889, + "grad_norm": 0.2366854429506929, + "learning_rate": 4.490707055949954e-06, + "loss": 0.1324, + "step": 2941 + }, + { + "epoch": 9.280094413847364, + "grad_norm": 0.2334841351792365, + "learning_rate": 4.4864501286487574e-06, + "loss": 0.1303, + "step": 2942 + }, + { + "epoch": 9.28324154209284, + "grad_norm": 0.2411585813043938, + "learning_rate": 4.482211493556974e-06, + "loss": 0.1346, + "step": 2943 + }, + { + "epoch": 9.286388670338317, + "grad_norm": 0.21989262346519506, + "learning_rate": 4.4779911551016934e-06, + "loss": 0.1316, + "step": 2944 + }, + { + "epoch": 9.289535798583792, + "grad_norm": 0.22995367913581893, + "learning_rate": 4.473789117690887e-06, + "loss": 0.1277, + "step": 2945 + }, + { + "epoch": 9.292682926829269, + "grad_norm": 0.22756421143574948, + "learning_rate": 4.469605385713421e-06, + "loss": 0.1351, + "step": 2946 + }, + { + "epoch": 9.295830055074743, + "grad_norm": 0.23592901330387175, + "learning_rate": 4.465439963539034e-06, + "loss": 0.1289, + "step": 2947 + }, + { + "epoch": 9.29897718332022, + "grad_norm": 0.22539025532476353, + "learning_rate": 4.4612928555183486e-06, + "loss": 0.1348, + "step": 2948 + }, + { + "epoch": 9.302124311565697, + "grad_norm": 0.22791556449603953, + "learning_rate": 4.45716406598285e-06, + "loss": 0.1372, + "step": 2949 + }, + { + "epoch": 9.305271439811172, + "grad_norm": 0.23492214436922185, + "learning_rate": 4.453053599244903e-06, + "loss": 0.1378, + "step": 2950 + }, + { + "epoch": 9.308418568056648, + "grad_norm": 0.2245331885362177, + "learning_rate": 4.448961459597719e-06, + "loss": 0.1334, + "step": 2951 + }, + { + "epoch": 9.311565696302125, + "grad_norm": 0.22809869975483035, + "learning_rate": 4.444887651315381e-06, + "loss": 0.1296, + "step": 2952 + }, + { + "epoch": 9.3147128245476, + "grad_norm": 0.2324760160461881, + "learning_rate": 4.440832178652819e-06, + "loss": 0.1334, + "step": 2953 + }, + { + "epoch": 9.317859952793077, + "grad_norm": 0.2446173081810758, + "learning_rate": 4.436795045845812e-06, + "loss": 0.1313, + "step": 2954 + }, + { + "epoch": 9.321007081038552, + "grad_norm": 0.2320500946229002, + "learning_rate": 4.432776257110989e-06, + "loss": 0.1356, + "step": 2955 + }, + { + "epoch": 9.324154209284028, + "grad_norm": 0.2448068713490391, + "learning_rate": 4.428775816645813e-06, + "loss": 0.1329, + "step": 2956 + }, + { + "epoch": 9.327301337529505, + "grad_norm": 0.22982431355817806, + "learning_rate": 4.424793728628586e-06, + "loss": 0.134, + "step": 2957 + }, + { + "epoch": 9.33044846577498, + "grad_norm": 0.22806790138954225, + "learning_rate": 4.420829997218441e-06, + "loss": 0.1362, + "step": 2958 + }, + { + "epoch": 9.333595594020457, + "grad_norm": 0.23824485969121148, + "learning_rate": 4.416884626555339e-06, + "loss": 0.1308, + "step": 2959 + }, + { + "epoch": 9.336742722265932, + "grad_norm": 0.23917326850947188, + "learning_rate": 4.412957620760065e-06, + "loss": 0.1235, + "step": 2960 + }, + { + "epoch": 9.339889850511408, + "grad_norm": 0.23389654883715155, + "learning_rate": 4.409048983934219e-06, + "loss": 0.1384, + "step": 2961 + }, + { + "epoch": 9.343036978756885, + "grad_norm": 0.2410002101605033, + "learning_rate": 4.405158720160217e-06, + "loss": 0.1391, + "step": 2962 + }, + { + "epoch": 9.34618410700236, + "grad_norm": 0.2463484092587633, + "learning_rate": 4.4012868335012865e-06, + "loss": 0.1313, + "step": 2963 + }, + { + "epoch": 9.349331235247837, + "grad_norm": 0.23216314413671288, + "learning_rate": 4.3974333280014605e-06, + "loss": 0.1321, + "step": 2964 + }, + { + "epoch": 9.352478363493312, + "grad_norm": 0.25795328112436694, + "learning_rate": 4.393598207685572e-06, + "loss": 0.134, + "step": 2965 + }, + { + "epoch": 9.355625491738788, + "grad_norm": 0.22300360535493108, + "learning_rate": 4.389781476559255e-06, + "loss": 0.1358, + "step": 2966 + }, + { + "epoch": 9.358772619984265, + "grad_norm": 0.2361759272681179, + "learning_rate": 4.385983138608928e-06, + "loss": 0.1355, + "step": 2967 + }, + { + "epoch": 9.36191974822974, + "grad_norm": 0.2393199934322128, + "learning_rate": 4.38220319780181e-06, + "loss": 0.1323, + "step": 2968 + }, + { + "epoch": 9.365066876475217, + "grad_norm": 0.2357587424561091, + "learning_rate": 4.378441658085899e-06, + "loss": 0.1358, + "step": 2969 + }, + { + "epoch": 9.368214004720693, + "grad_norm": 0.24506755276712727, + "learning_rate": 4.374698523389971e-06, + "loss": 0.1251, + "step": 2970 + }, + { + "epoch": 9.371361132966168, + "grad_norm": 0.23734868049410257, + "learning_rate": 4.370973797623585e-06, + "loss": 0.1355, + "step": 2971 + }, + { + "epoch": 9.374508261211645, + "grad_norm": 0.23479098741562127, + "learning_rate": 4.367267484677067e-06, + "loss": 0.1332, + "step": 2972 + }, + { + "epoch": 9.37765538945712, + "grad_norm": 0.2301962617124699, + "learning_rate": 4.363579588421517e-06, + "loss": 0.1282, + "step": 2973 + }, + { + "epoch": 9.380802517702596, + "grad_norm": 0.24990595732884477, + "learning_rate": 4.3599101127087944e-06, + "loss": 0.1287, + "step": 2974 + }, + { + "epoch": 9.383949645948073, + "grad_norm": 0.24405837184477827, + "learning_rate": 4.356259061371524e-06, + "loss": 0.1322, + "step": 2975 + }, + { + "epoch": 9.387096774193548, + "grad_norm": 0.23354503269275637, + "learning_rate": 4.3526264382230806e-06, + "loss": 0.1301, + "step": 2976 + }, + { + "epoch": 9.390243902439025, + "grad_norm": 0.22497899824756662, + "learning_rate": 4.349012247057597e-06, + "loss": 0.1341, + "step": 2977 + }, + { + "epoch": 9.3933910306845, + "grad_norm": 0.233509003106069, + "learning_rate": 4.345416491649954e-06, + "loss": 0.1291, + "step": 2978 + }, + { + "epoch": 9.396538158929976, + "grad_norm": 0.23591177828002854, + "learning_rate": 4.3418391757557745e-06, + "loss": 0.1311, + "step": 2979 + }, + { + "epoch": 9.399685287175453, + "grad_norm": 0.22958226547889216, + "learning_rate": 4.338280303111426e-06, + "loss": 0.1321, + "step": 2980 + }, + { + "epoch": 9.402832415420928, + "grad_norm": 0.24045468326756445, + "learning_rate": 4.334739877434006e-06, + "loss": 0.1326, + "step": 2981 + }, + { + "epoch": 9.405979543666405, + "grad_norm": 0.24332656727987065, + "learning_rate": 4.33121790242135e-06, + "loss": 0.1353, + "step": 2982 + }, + { + "epoch": 9.40912667191188, + "grad_norm": 0.2318763713593753, + "learning_rate": 4.327714381752023e-06, + "loss": 0.1309, + "step": 2983 + }, + { + "epoch": 9.412273800157356, + "grad_norm": 0.23460111972050288, + "learning_rate": 4.32422931908531e-06, + "loss": 0.1306, + "step": 2984 + }, + { + "epoch": 9.415420928402833, + "grad_norm": 0.23033494597201723, + "learning_rate": 4.320762718061228e-06, + "loss": 0.1341, + "step": 2985 + }, + { + "epoch": 9.418568056648308, + "grad_norm": 0.24542818156662408, + "learning_rate": 4.317314582300496e-06, + "loss": 0.1324, + "step": 2986 + }, + { + "epoch": 9.421715184893785, + "grad_norm": 0.2363958350642206, + "learning_rate": 4.313884915404562e-06, + "loss": 0.1346, + "step": 2987 + }, + { + "epoch": 9.424862313139261, + "grad_norm": 0.24392888891052444, + "learning_rate": 4.3104737209555735e-06, + "loss": 0.1293, + "step": 2988 + }, + { + "epoch": 9.428009441384736, + "grad_norm": 0.2270478248172354, + "learning_rate": 4.30708100251639e-06, + "loss": 0.1342, + "step": 2989 + }, + { + "epoch": 9.431156569630213, + "grad_norm": 0.2419755010247926, + "learning_rate": 4.3037067636305695e-06, + "loss": 0.1361, + "step": 2990 + }, + { + "epoch": 9.434303697875688, + "grad_norm": 0.24302820086496213, + "learning_rate": 4.3003510078223735e-06, + "loss": 0.1357, + "step": 2991 + }, + { + "epoch": 9.437450826121164, + "grad_norm": 0.22889930071492856, + "learning_rate": 4.297013738596754e-06, + "loss": 0.1326, + "step": 2992 + }, + { + "epoch": 9.440597954366641, + "grad_norm": 0.2470105830931397, + "learning_rate": 4.293694959439357e-06, + "loss": 0.1307, + "step": 2993 + }, + { + "epoch": 9.443745082612116, + "grad_norm": 0.24509387802011043, + "learning_rate": 4.290394673816518e-06, + "loss": 0.1351, + "step": 2994 + }, + { + "epoch": 9.446892210857593, + "grad_norm": 0.24182787680185575, + "learning_rate": 4.287112885175252e-06, + "loss": 0.1392, + "step": 2995 + }, + { + "epoch": 9.450039339103068, + "grad_norm": 0.2467508126477943, + "learning_rate": 4.283849596943258e-06, + "loss": 0.1263, + "step": 2996 + }, + { + "epoch": 9.453186467348544, + "grad_norm": 0.2316135678434177, + "learning_rate": 4.280604812528912e-06, + "loss": 0.1324, + "step": 2997 + }, + { + "epoch": 9.456333595594021, + "grad_norm": 0.2268465070914208, + "learning_rate": 4.277378535321262e-06, + "loss": 0.1328, + "step": 2998 + }, + { + "epoch": 9.459480723839496, + "grad_norm": 0.2332807858816893, + "learning_rate": 4.274170768690028e-06, + "loss": 0.1373, + "step": 2999 + }, + { + "epoch": 9.462627852084973, + "grad_norm": 0.22900858658201778, + "learning_rate": 4.270981515985594e-06, + "loss": 0.1329, + "step": 3000 + }, + { + "epoch": 9.465774980330448, + "grad_norm": 0.2369332302133477, + "learning_rate": 4.26781078053901e-06, + "loss": 0.1314, + "step": 3001 + }, + { + "epoch": 9.468922108575924, + "grad_norm": 0.24455011847922023, + "learning_rate": 4.264658565661981e-06, + "loss": 0.1285, + "step": 3002 + }, + { + "epoch": 9.472069236821401, + "grad_norm": 0.239435503067824, + "learning_rate": 4.261524874646873e-06, + "loss": 0.1332, + "step": 3003 + }, + { + "epoch": 9.475216365066876, + "grad_norm": 0.2492226908546605, + "learning_rate": 4.258409710766699e-06, + "loss": 0.1278, + "step": 3004 + }, + { + "epoch": 9.478363493312353, + "grad_norm": 0.23444315425803552, + "learning_rate": 4.255313077275127e-06, + "loss": 0.1376, + "step": 3005 + }, + { + "epoch": 9.48151062155783, + "grad_norm": 0.23211757532136212, + "learning_rate": 4.252234977406469e-06, + "loss": 0.1327, + "step": 3006 + }, + { + "epoch": 9.484657749803304, + "grad_norm": 0.2313035945554957, + "learning_rate": 4.249175414375676e-06, + "loss": 0.1335, + "step": 3007 + }, + { + "epoch": 9.487804878048781, + "grad_norm": 0.24305332604029584, + "learning_rate": 4.246134391378343e-06, + "loss": 0.1288, + "step": 3008 + }, + { + "epoch": 9.490952006294256, + "grad_norm": 0.2416576523834277, + "learning_rate": 4.243111911590694e-06, + "loss": 0.1346, + "step": 3009 + }, + { + "epoch": 9.494099134539733, + "grad_norm": 0.23223887323478282, + "learning_rate": 4.240107978169594e-06, + "loss": 0.1357, + "step": 3010 + }, + { + "epoch": 9.49724626278521, + "grad_norm": 0.23335311574377063, + "learning_rate": 4.23712259425253e-06, + "loss": 0.1336, + "step": 3011 + }, + { + "epoch": 9.500393391030684, + "grad_norm": 0.23028601506740967, + "learning_rate": 4.234155762957619e-06, + "loss": 0.1367, + "step": 3012 + }, + { + "epoch": 9.50354051927616, + "grad_norm": 0.2446292329288432, + "learning_rate": 4.231207487383596e-06, + "loss": 0.1363, + "step": 3013 + }, + { + "epoch": 9.506687647521636, + "grad_norm": 0.24503357102458362, + "learning_rate": 4.228277770609821e-06, + "loss": 0.1386, + "step": 3014 + }, + { + "epoch": 9.509834775767112, + "grad_norm": 0.2445600531818062, + "learning_rate": 4.225366615696263e-06, + "loss": 0.1369, + "step": 3015 + }, + { + "epoch": 9.51298190401259, + "grad_norm": 0.23791913882537596, + "learning_rate": 4.222474025683514e-06, + "loss": 0.1346, + "step": 3016 + }, + { + "epoch": 9.516129032258064, + "grad_norm": 0.2429591101645711, + "learning_rate": 4.219600003592767e-06, + "loss": 0.1307, + "step": 3017 + }, + { + "epoch": 9.51927616050354, + "grad_norm": 0.3968467791341316, + "learning_rate": 4.2167445524258226e-06, + "loss": 0.1379, + "step": 3018 + }, + { + "epoch": 9.522423288749017, + "grad_norm": 0.23354286684086903, + "learning_rate": 4.213907675165086e-06, + "loss": 0.1312, + "step": 3019 + }, + { + "epoch": 9.525570416994492, + "grad_norm": 0.23749531424478923, + "learning_rate": 4.2110893747735655e-06, + "loss": 0.1308, + "step": 3020 + }, + { + "epoch": 9.528717545239969, + "grad_norm": 0.24147254873584723, + "learning_rate": 4.2082896541948675e-06, + "loss": 0.1374, + "step": 3021 + }, + { + "epoch": 9.531864673485444, + "grad_norm": 0.252926435908387, + "learning_rate": 4.205508516353183e-06, + "loss": 0.139, + "step": 3022 + }, + { + "epoch": 9.53501180173092, + "grad_norm": 0.24123310444093413, + "learning_rate": 4.202745964153305e-06, + "loss": 0.1296, + "step": 3023 + }, + { + "epoch": 9.538158929976397, + "grad_norm": 0.22888213642637095, + "learning_rate": 4.200002000480605e-06, + "loss": 0.1333, + "step": 3024 + }, + { + "epoch": 9.541306058221872, + "grad_norm": 0.23412443013585427, + "learning_rate": 4.197276628201048e-06, + "loss": 0.1357, + "step": 3025 + }, + { + "epoch": 9.544453186467349, + "grad_norm": 0.2506322070152408, + "learning_rate": 4.194569850161179e-06, + "loss": 0.1351, + "step": 3026 + }, + { + "epoch": 9.547600314712824, + "grad_norm": 0.2314307114246427, + "learning_rate": 4.191881669188117e-06, + "loss": 0.1377, + "step": 3027 + }, + { + "epoch": 9.5507474429583, + "grad_norm": 0.2383274553063989, + "learning_rate": 4.1892120880895605e-06, + "loss": 0.1333, + "step": 3028 + }, + { + "epoch": 9.553894571203777, + "grad_norm": 0.23157419286047795, + "learning_rate": 4.186561109653784e-06, + "loss": 0.1401, + "step": 3029 + }, + { + "epoch": 9.557041699449252, + "grad_norm": 0.2454714122459766, + "learning_rate": 4.1839287366496285e-06, + "loss": 0.1351, + "step": 3030 + }, + { + "epoch": 9.560188827694729, + "grad_norm": 0.23213441228577894, + "learning_rate": 4.181314971826502e-06, + "loss": 0.1349, + "step": 3031 + }, + { + "epoch": 9.563335955940204, + "grad_norm": 0.2383713805883692, + "learning_rate": 4.178719817914378e-06, + "loss": 0.1322, + "step": 3032 + }, + { + "epoch": 9.56648308418568, + "grad_norm": 0.23809296540852945, + "learning_rate": 4.176143277623796e-06, + "loss": 0.1236, + "step": 3033 + }, + { + "epoch": 9.569630212431157, + "grad_norm": 0.2452859142596891, + "learning_rate": 4.1735853536458455e-06, + "loss": 0.1334, + "step": 3034 + }, + { + "epoch": 9.572777340676632, + "grad_norm": 0.2528048789801201, + "learning_rate": 4.1710460486521795e-06, + "loss": 0.1345, + "step": 3035 + }, + { + "epoch": 9.575924468922109, + "grad_norm": 0.24602308858149582, + "learning_rate": 4.168525365295002e-06, + "loss": 0.1352, + "step": 3036 + }, + { + "epoch": 9.579071597167584, + "grad_norm": 0.24143788400432517, + "learning_rate": 4.166023306207066e-06, + "loss": 0.1344, + "step": 3037 + }, + { + "epoch": 9.58221872541306, + "grad_norm": 0.24049001092299743, + "learning_rate": 4.163539874001671e-06, + "loss": 0.1361, + "step": 3038 + }, + { + "epoch": 9.585365853658537, + "grad_norm": 0.23227717157473632, + "learning_rate": 4.161075071272668e-06, + "loss": 0.1305, + "step": 3039 + }, + { + "epoch": 9.588512981904012, + "grad_norm": 0.2350438398401355, + "learning_rate": 4.158628900594442e-06, + "loss": 0.1313, + "step": 3040 + }, + { + "epoch": 9.591660110149489, + "grad_norm": 0.2374230007231774, + "learning_rate": 4.156201364521924e-06, + "loss": 0.1356, + "step": 3041 + }, + { + "epoch": 9.594807238394965, + "grad_norm": 0.23945340129922146, + "learning_rate": 4.1537924655905785e-06, + "loss": 0.137, + "step": 3042 + }, + { + "epoch": 9.59795436664044, + "grad_norm": 0.23799731806867103, + "learning_rate": 4.151402206316405e-06, + "loss": 0.1294, + "step": 3043 + }, + { + "epoch": 9.601101494885917, + "grad_norm": 0.2361952391608318, + "learning_rate": 4.1490305891959334e-06, + "loss": 0.1312, + "step": 3044 + }, + { + "epoch": 9.604248623131392, + "grad_norm": 0.24879673775767014, + "learning_rate": 4.146677616706226e-06, + "loss": 0.1305, + "step": 3045 + }, + { + "epoch": 9.607395751376869, + "grad_norm": 0.23579330579462654, + "learning_rate": 4.144343291304867e-06, + "loss": 0.1342, + "step": 3046 + }, + { + "epoch": 9.610542879622345, + "grad_norm": 0.2293828810426797, + "learning_rate": 4.14202761542997e-06, + "loss": 0.1397, + "step": 3047 + }, + { + "epoch": 9.61369000786782, + "grad_norm": 0.22998664945318115, + "learning_rate": 4.139730591500165e-06, + "loss": 0.1343, + "step": 3048 + }, + { + "epoch": 9.616837136113297, + "grad_norm": 0.22872577120241833, + "learning_rate": 4.137452221914602e-06, + "loss": 0.1315, + "step": 3049 + }, + { + "epoch": 9.619984264358772, + "grad_norm": 0.25033210170902204, + "learning_rate": 4.135192509052947e-06, + "loss": 0.1324, + "step": 3050 + }, + { + "epoch": 9.623131392604249, + "grad_norm": 0.23939961130535323, + "learning_rate": 4.132951455275385e-06, + "loss": 0.1347, + "step": 3051 + }, + { + "epoch": 9.626278520849725, + "grad_norm": 0.24037659462353853, + "learning_rate": 4.130729062922602e-06, + "loss": 0.1323, + "step": 3052 + }, + { + "epoch": 9.6294256490952, + "grad_norm": 0.22674798888307743, + "learning_rate": 4.1285253343158045e-06, + "loss": 0.1418, + "step": 3053 + }, + { + "epoch": 9.632572777340677, + "grad_norm": 0.24399535601791178, + "learning_rate": 4.126340271756696e-06, + "loss": 0.1345, + "step": 3054 + }, + { + "epoch": 9.635719905586154, + "grad_norm": 0.2380467339574685, + "learning_rate": 4.1241738775274875e-06, + "loss": 0.1322, + "step": 3055 + }, + { + "epoch": 9.638867033831628, + "grad_norm": 0.2291006675130497, + "learning_rate": 4.122026153890896e-06, + "loss": 0.1365, + "step": 3056 + }, + { + "epoch": 9.642014162077105, + "grad_norm": 0.23764446572668885, + "learning_rate": 4.119897103090129e-06, + "loss": 0.1362, + "step": 3057 + }, + { + "epoch": 9.64516129032258, + "grad_norm": 0.24054188658200384, + "learning_rate": 4.117786727348898e-06, + "loss": 0.135, + "step": 3058 + }, + { + "epoch": 9.648308418568057, + "grad_norm": 0.22923630142098805, + "learning_rate": 4.1156950288714084e-06, + "loss": 0.1376, + "step": 3059 + }, + { + "epoch": 9.651455546813533, + "grad_norm": 0.23674042582246743, + "learning_rate": 4.113622009842354e-06, + "loss": 0.138, + "step": 3060 + }, + { + "epoch": 9.654602675059008, + "grad_norm": 0.23241455559511415, + "learning_rate": 4.111567672426922e-06, + "loss": 0.1394, + "step": 3061 + }, + { + "epoch": 9.657749803304485, + "grad_norm": 0.22726828049867015, + "learning_rate": 4.109532018770787e-06, + "loss": 0.1299, + "step": 3062 + }, + { + "epoch": 9.66089693154996, + "grad_norm": 0.2320797444398632, + "learning_rate": 4.107515051000108e-06, + "loss": 0.1364, + "step": 3063 + }, + { + "epoch": 9.664044059795437, + "grad_norm": 0.237128823718978, + "learning_rate": 4.105516771221528e-06, + "loss": 0.1312, + "step": 3064 + }, + { + "epoch": 9.667191188040913, + "grad_norm": 0.24882016641887955, + "learning_rate": 4.10353718152217e-06, + "loss": 0.1342, + "step": 3065 + }, + { + "epoch": 9.670338316286388, + "grad_norm": 0.23873796840310055, + "learning_rate": 4.1015762839696396e-06, + "loss": 0.1345, + "step": 3066 + }, + { + "epoch": 9.673485444531865, + "grad_norm": 0.2391341419187129, + "learning_rate": 4.099634080612016e-06, + "loss": 0.1324, + "step": 3067 + }, + { + "epoch": 9.67663257277734, + "grad_norm": 0.23289972746538012, + "learning_rate": 4.097710573477852e-06, + "loss": 0.1389, + "step": 3068 + }, + { + "epoch": 9.679779701022817, + "grad_norm": 0.23306239327347605, + "learning_rate": 4.095805764576177e-06, + "loss": 0.1362, + "step": 3069 + }, + { + "epoch": 9.682926829268293, + "grad_norm": 0.24480810822083465, + "learning_rate": 4.093919655896484e-06, + "loss": 0.1278, + "step": 3070 + }, + { + "epoch": 9.686073957513768, + "grad_norm": 0.23785192923431273, + "learning_rate": 4.092052249408746e-06, + "loss": 0.1325, + "step": 3071 + }, + { + "epoch": 9.689221085759245, + "grad_norm": 0.23076825052848043, + "learning_rate": 4.090203547063389e-06, + "loss": 0.136, + "step": 3072 + }, + { + "epoch": 9.69236821400472, + "grad_norm": 0.2339128976885369, + "learning_rate": 4.0883735507913105e-06, + "loss": 0.1324, + "step": 3073 + }, + { + "epoch": 9.695515342250197, + "grad_norm": 0.24873730826363002, + "learning_rate": 4.0865622625038725e-06, + "loss": 0.1305, + "step": 3074 + }, + { + "epoch": 9.698662470495673, + "grad_norm": 0.23691902367088646, + "learning_rate": 4.08476968409289e-06, + "loss": 0.1324, + "step": 3075 + }, + { + "epoch": 9.701809598741148, + "grad_norm": 0.24004736754782502, + "learning_rate": 4.0829958174306435e-06, + "loss": 0.1395, + "step": 3076 + }, + { + "epoch": 9.704956726986625, + "grad_norm": 0.23649363473883206, + "learning_rate": 4.081240664369862e-06, + "loss": 0.1297, + "step": 3077 + }, + { + "epoch": 9.708103855232102, + "grad_norm": 0.2419505558569642, + "learning_rate": 4.079504226743739e-06, + "loss": 0.136, + "step": 3078 + }, + { + "epoch": 9.711250983477576, + "grad_norm": 0.23502797156800156, + "learning_rate": 4.077786506365911e-06, + "loss": 0.1334, + "step": 3079 + }, + { + "epoch": 9.714398111723053, + "grad_norm": 0.21960473813286804, + "learning_rate": 4.076087505030471e-06, + "loss": 0.1356, + "step": 3080 + }, + { + "epoch": 9.717545239968528, + "grad_norm": 0.23767379913931116, + "learning_rate": 4.074407224511955e-06, + "loss": 0.1325, + "step": 3081 + }, + { + "epoch": 9.720692368214005, + "grad_norm": 0.2296867126057468, + "learning_rate": 4.072745666565352e-06, + "loss": 0.1345, + "step": 3082 + }, + { + "epoch": 9.723839496459481, + "grad_norm": 0.23047397670360414, + "learning_rate": 4.071102832926097e-06, + "loss": 0.1302, + "step": 3083 + }, + { + "epoch": 9.726986624704956, + "grad_norm": 0.23998182867200016, + "learning_rate": 4.0694787253100585e-06, + "loss": 0.1338, + "step": 3084 + }, + { + "epoch": 9.730133752950433, + "grad_norm": 0.2323852688580098, + "learning_rate": 4.067873345413555e-06, + "loss": 0.1315, + "step": 3085 + }, + { + "epoch": 9.733280881195908, + "grad_norm": 0.24171313095121516, + "learning_rate": 4.066286694913345e-06, + "loss": 0.1341, + "step": 3086 + }, + { + "epoch": 9.736428009441385, + "grad_norm": 0.22679821098821545, + "learning_rate": 4.064718775466618e-06, + "loss": 0.1269, + "step": 3087 + }, + { + "epoch": 9.739575137686861, + "grad_norm": 0.23502911107275046, + "learning_rate": 4.063169588711004e-06, + "loss": 0.1345, + "step": 3088 + }, + { + "epoch": 9.742722265932336, + "grad_norm": 0.23836190822770764, + "learning_rate": 4.0616391362645715e-06, + "loss": 0.1346, + "step": 3089 + }, + { + "epoch": 9.745869394177813, + "grad_norm": 0.2278799148478252, + "learning_rate": 4.060127419725812e-06, + "loss": 0.1367, + "step": 3090 + }, + { + "epoch": 9.74901652242329, + "grad_norm": 0.2456917732347001, + "learning_rate": 4.058634440673658e-06, + "loss": 0.1326, + "step": 3091 + }, + { + "epoch": 9.752163650668765, + "grad_norm": 0.23836719259607256, + "learning_rate": 4.057160200667464e-06, + "loss": 0.1308, + "step": 3092 + }, + { + "epoch": 9.755310778914241, + "grad_norm": 0.2429286720699004, + "learning_rate": 4.055704701247018e-06, + "loss": 0.1327, + "step": 3093 + }, + { + "epoch": 9.758457907159716, + "grad_norm": 0.23875447802288385, + "learning_rate": 4.05426794393253e-06, + "loss": 0.1314, + "step": 3094 + }, + { + "epoch": 9.761605035405193, + "grad_norm": 0.2278467270000926, + "learning_rate": 4.052849930224636e-06, + "loss": 0.1356, + "step": 3095 + }, + { + "epoch": 9.76475216365067, + "grad_norm": 0.23233849614758936, + "learning_rate": 4.051450661604395e-06, + "loss": 0.1311, + "step": 3096 + }, + { + "epoch": 9.767899291896144, + "grad_norm": 0.23866830169952308, + "learning_rate": 4.0500701395332875e-06, + "loss": 0.1297, + "step": 3097 + }, + { + "epoch": 9.771046420141621, + "grad_norm": 0.23851746320196832, + "learning_rate": 4.0487083654532165e-06, + "loss": 0.1357, + "step": 3098 + }, + { + "epoch": 9.774193548387096, + "grad_norm": 0.2327507724699466, + "learning_rate": 4.047365340786496e-06, + "loss": 0.1338, + "step": 3099 + }, + { + "epoch": 9.777340676632573, + "grad_norm": 0.2293150922887092, + "learning_rate": 4.046041066935868e-06, + "loss": 0.1295, + "step": 3100 + }, + { + "epoch": 9.78048780487805, + "grad_norm": 0.2391233027321985, + "learning_rate": 4.044735545284482e-06, + "loss": 0.1389, + "step": 3101 + }, + { + "epoch": 9.783634933123524, + "grad_norm": 0.23945974117378163, + "learning_rate": 4.043448777195901e-06, + "loss": 0.1381, + "step": 3102 + }, + { + "epoch": 9.786782061369001, + "grad_norm": 0.22786053750451218, + "learning_rate": 4.042180764014107e-06, + "loss": 0.1385, + "step": 3103 + }, + { + "epoch": 9.789929189614476, + "grad_norm": 0.23783743029557908, + "learning_rate": 4.040931507063487e-06, + "loss": 0.1366, + "step": 3104 + }, + { + "epoch": 9.793076317859953, + "grad_norm": 0.24131425400897927, + "learning_rate": 4.039701007648843e-06, + "loss": 0.1336, + "step": 3105 + }, + { + "epoch": 9.79622344610543, + "grad_norm": 0.24372846124327077, + "learning_rate": 4.0384892670553795e-06, + "loss": 0.1383, + "step": 3106 + }, + { + "epoch": 9.799370574350904, + "grad_norm": 0.23973719599414378, + "learning_rate": 4.0372962865487145e-06, + "loss": 0.1326, + "step": 3107 + }, + { + "epoch": 9.802517702596381, + "grad_norm": 0.24009549823324744, + "learning_rate": 4.036122067374869e-06, + "loss": 0.1338, + "step": 3108 + }, + { + "epoch": 9.805664830841856, + "grad_norm": 0.23976142453197466, + "learning_rate": 4.034966610760265e-06, + "loss": 0.1401, + "step": 3109 + }, + { + "epoch": 9.808811959087333, + "grad_norm": 0.22620166570113598, + "learning_rate": 4.033829917911736e-06, + "loss": 0.138, + "step": 3110 + }, + { + "epoch": 9.81195908733281, + "grad_norm": 0.23910385806970655, + "learning_rate": 4.032711990016509e-06, + "loss": 0.1319, + "step": 3111 + }, + { + "epoch": 9.815106215578284, + "grad_norm": 0.22753533073166138, + "learning_rate": 4.031612828242216e-06, + "loss": 0.1338, + "step": 3112 + }, + { + "epoch": 9.818253343823761, + "grad_norm": 0.2421208770020626, + "learning_rate": 4.030532433736889e-06, + "loss": 0.1365, + "step": 3113 + }, + { + "epoch": 9.821400472069238, + "grad_norm": 0.23782192539849972, + "learning_rate": 4.029470807628956e-06, + "loss": 0.1356, + "step": 3114 + }, + { + "epoch": 9.824547600314713, + "grad_norm": 0.23934965568846245, + "learning_rate": 4.028427951027245e-06, + "loss": 0.1358, + "step": 3115 + }, + { + "epoch": 9.82769472856019, + "grad_norm": 0.23281488681008186, + "learning_rate": 4.027403865020977e-06, + "loss": 0.1308, + "step": 3116 + }, + { + "epoch": 9.830841856805664, + "grad_norm": 0.2387332651986393, + "learning_rate": 4.026398550679772e-06, + "loss": 0.1317, + "step": 3117 + }, + { + "epoch": 9.83398898505114, + "grad_norm": 0.24160259965898973, + "learning_rate": 4.025412009053636e-06, + "loss": 0.1364, + "step": 3118 + }, + { + "epoch": 9.837136113296618, + "grad_norm": 0.23512080872741115, + "learning_rate": 4.0244442411729775e-06, + "loss": 0.135, + "step": 3119 + }, + { + "epoch": 9.840283241542092, + "grad_norm": 0.23515000349024553, + "learning_rate": 4.02349524804859e-06, + "loss": 0.1424, + "step": 3120 + }, + { + "epoch": 9.84343036978757, + "grad_norm": 0.24040363807543386, + "learning_rate": 4.02256503067166e-06, + "loss": 0.1326, + "step": 3121 + }, + { + "epoch": 9.846577498033044, + "grad_norm": 0.23796510922943842, + "learning_rate": 4.021653590013759e-06, + "loss": 0.1402, + "step": 3122 + }, + { + "epoch": 9.84972462627852, + "grad_norm": 0.2405813863455342, + "learning_rate": 4.020760927026856e-06, + "loss": 0.1382, + "step": 3123 + }, + { + "epoch": 9.852871754523997, + "grad_norm": 0.2476312470454177, + "learning_rate": 4.019887042643299e-06, + "loss": 0.1308, + "step": 3124 + }, + { + "epoch": 9.856018882769472, + "grad_norm": 0.23636778802712008, + "learning_rate": 4.019031937775827e-06, + "loss": 0.1351, + "step": 3125 + }, + { + "epoch": 9.859166011014949, + "grad_norm": 0.23409257155120913, + "learning_rate": 4.01819561331756e-06, + "loss": 0.1376, + "step": 3126 + }, + { + "epoch": 9.862313139260426, + "grad_norm": 0.234688041297315, + "learning_rate": 4.017378070142011e-06, + "loss": 0.131, + "step": 3127 + }, + { + "epoch": 9.8654602675059, + "grad_norm": 0.23250184458501116, + "learning_rate": 4.016579309103068e-06, + "loss": 0.1312, + "step": 3128 + }, + { + "epoch": 9.868607395751377, + "grad_norm": 0.2334349376577723, + "learning_rate": 4.015799331035007e-06, + "loss": 0.1323, + "step": 3129 + }, + { + "epoch": 9.871754523996852, + "grad_norm": 0.23703297054055375, + "learning_rate": 4.015038136752481e-06, + "loss": 0.1343, + "step": 3130 + }, + { + "epoch": 9.874901652242329, + "grad_norm": 0.24196689257467524, + "learning_rate": 4.01429572705053e-06, + "loss": 0.1355, + "step": 3131 + }, + { + "epoch": 9.878048780487806, + "grad_norm": 0.24673350426408439, + "learning_rate": 4.013572102704572e-06, + "loss": 0.1323, + "step": 3132 + }, + { + "epoch": 9.88119590873328, + "grad_norm": 0.23280087204706737, + "learning_rate": 4.012867264470404e-06, + "loss": 0.1336, + "step": 3133 + }, + { + "epoch": 9.884343036978757, + "grad_norm": 0.24120088645534854, + "learning_rate": 4.0121812130842e-06, + "loss": 0.14, + "step": 3134 + }, + { + "epoch": 9.887490165224232, + "grad_norm": 0.2337805163325715, + "learning_rate": 4.0115139492625134e-06, + "loss": 0.1361, + "step": 3135 + }, + { + "epoch": 9.890637293469709, + "grad_norm": 0.23532202894035434, + "learning_rate": 4.0108654737022755e-06, + "loss": 0.1335, + "step": 3136 + }, + { + "epoch": 9.893784421715186, + "grad_norm": 0.24125197048086924, + "learning_rate": 4.010235787080794e-06, + "loss": 0.1378, + "step": 3137 + }, + { + "epoch": 9.89693154996066, + "grad_norm": 0.24669623748502031, + "learning_rate": 4.00962489005575e-06, + "loss": 0.1326, + "step": 3138 + }, + { + "epoch": 9.900078678206137, + "grad_norm": 0.24343818316183338, + "learning_rate": 4.009032783265204e-06, + "loss": 0.1348, + "step": 3139 + }, + { + "epoch": 9.903225806451612, + "grad_norm": 0.22541573373892404, + "learning_rate": 4.008459467327586e-06, + "loss": 0.1334, + "step": 3140 + }, + { + "epoch": 9.906372934697089, + "grad_norm": 0.24314992873545332, + "learning_rate": 4.007904942841702e-06, + "loss": 0.1333, + "step": 3141 + }, + { + "epoch": 9.909520062942565, + "grad_norm": 0.24444507569036877, + "learning_rate": 4.007369210386732e-06, + "loss": 0.1355, + "step": 3142 + }, + { + "epoch": 9.91266719118804, + "grad_norm": 0.23456707600284918, + "learning_rate": 4.006852270522226e-06, + "loss": 0.1373, + "step": 3143 + }, + { + "epoch": 9.915814319433517, + "grad_norm": 0.22873767694094543, + "learning_rate": 4.006354123788107e-06, + "loss": 0.1382, + "step": 3144 + }, + { + "epoch": 9.918961447678992, + "grad_norm": 0.23735618369859499, + "learning_rate": 4.00587477070467e-06, + "loss": 0.1379, + "step": 3145 + }, + { + "epoch": 9.922108575924469, + "grad_norm": 0.23956072263659126, + "learning_rate": 4.005414211772583e-06, + "loss": 0.1371, + "step": 3146 + }, + { + "epoch": 9.925255704169945, + "grad_norm": 0.23043205951102974, + "learning_rate": 4.004972447472878e-06, + "loss": 0.1327, + "step": 3147 + }, + { + "epoch": 9.92840283241542, + "grad_norm": 0.2344662864402095, + "learning_rate": 4.00454947826696e-06, + "loss": 0.1319, + "step": 3148 + }, + { + "epoch": 9.931549960660897, + "grad_norm": 0.24245229721032507, + "learning_rate": 4.0041453045966055e-06, + "loss": 0.1383, + "step": 3149 + }, + { + "epoch": 9.934697088906374, + "grad_norm": 0.24522019444100665, + "learning_rate": 4.003759926883958e-06, + "loss": 0.1346, + "step": 3150 + }, + { + "epoch": 9.937844217151849, + "grad_norm": 0.23782263325071246, + "learning_rate": 4.003393345531529e-06, + "loss": 0.145, + "step": 3151 + }, + { + "epoch": 9.940991345397325, + "grad_norm": 0.24011271527037592, + "learning_rate": 4.0030455609221975e-06, + "loss": 0.1341, + "step": 3152 + }, + { + "epoch": 9.9441384736428, + "grad_norm": 0.23685295626777536, + "learning_rate": 4.0027165734192115e-06, + "loss": 0.1343, + "step": 3153 + }, + { + "epoch": 9.947285601888277, + "grad_norm": 0.24415877369974173, + "learning_rate": 4.002406383366186e-06, + "loss": 0.1343, + "step": 3154 + }, + { + "epoch": 9.950432730133754, + "grad_norm": 0.23530638296528372, + "learning_rate": 4.0021149910871e-06, + "loss": 0.1344, + "step": 3155 + }, + { + "epoch": 9.953579858379229, + "grad_norm": 0.24560555060327768, + "learning_rate": 4.001842396886302e-06, + "loss": 0.1364, + "step": 3156 + }, + { + "epoch": 9.956726986624705, + "grad_norm": 0.23476225232747105, + "learning_rate": 4.001588601048508e-06, + "loss": 0.1339, + "step": 3157 + }, + { + "epoch": 9.95987411487018, + "grad_norm": 0.2326742437614356, + "learning_rate": 4.0013536038387946e-06, + "loss": 0.138, + "step": 3158 + }, + { + "epoch": 9.963021243115657, + "grad_norm": 0.23396824661309967, + "learning_rate": 4.00113740550261e-06, + "loss": 0.133, + "step": 3159 + }, + { + "epoch": 9.966168371361134, + "grad_norm": 0.23890877834056157, + "learning_rate": 4.000940006265763e-06, + "loss": 0.1362, + "step": 3160 + }, + { + "epoch": 9.969315499606608, + "grad_norm": 0.232863057932829, + "learning_rate": 4.000761406334429e-06, + "loss": 0.1298, + "step": 3161 + }, + { + "epoch": 9.972462627852085, + "grad_norm": 0.24073916528859923, + "learning_rate": 4.000601605895147e-06, + "loss": 0.1449, + "step": 3162 + }, + { + "epoch": 9.975609756097562, + "grad_norm": 0.23431234155479022, + "learning_rate": 4.000460605114827e-06, + "loss": 0.1384, + "step": 3163 + }, + { + "epoch": 9.978756884343037, + "grad_norm": 0.2290325302172204, + "learning_rate": 4.000338404140736e-06, + "loss": 0.1353, + "step": 3164 + }, + { + "epoch": 9.981904012588513, + "grad_norm": 0.2480177908337849, + "learning_rate": 4.00023500310051e-06, + "loss": 0.1325, + "step": 3165 + }, + { + "epoch": 9.985051140833988, + "grad_norm": 0.23707047681989588, + "learning_rate": 4.000150402102143e-06, + "loss": 0.1358, + "step": 3166 + }, + { + "epoch": 9.988198269079465, + "grad_norm": 0.25048005095022025, + "learning_rate": 4.000084601234001e-06, + "loss": 0.1356, + "step": 3167 + }, + { + "epoch": 9.991345397324942, + "grad_norm": 0.24387714398171834, + "learning_rate": 4.000037600564808e-06, + "loss": 0.1329, + "step": 3168 + }, + { + "epoch": 9.994492525570417, + "grad_norm": 0.24315440115509737, + "learning_rate": 4.000009400143658e-06, + "loss": 0.137, + "step": 3169 + }, + { + "epoch": 9.997639653815893, + "grad_norm": 0.22876527267837543, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1403, + "step": 3170 + } + ], + "logging_steps": 1, + "max_steps": 3170, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 634, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0340729464569725e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}